mdw@git.distorted.org.uk Git - sgt/halibut/blob - bk_html.c

   1 /*
   2  * HTML backend for Halibut
   3  */
   4
   5 /*
   6  * TODO:
   7  *
   8  *  - I'm never entirely convinced that having a fragment link to
   9  *    come in at the start of the real text in the file is
  10  *    sensible. Perhaps for the topmost section in the file, no
  11  *    fragment should be used? (Though it should probably still be
  12  *    _there_ even if unused.)
  13  *
  14  *  - new configurability:
  15  *     * a few new things explicitly labelled as `FIXME:
  16  *       configurable' or similar.
  17  *     * Some means of specifying the distinction between
  18  *       restrict-charset and output-charset. It seems to me that
  19  *       `html-charset' is output-charset, and that
  20  *       restrict-charset usually wants to be either output-charset
  21  *       or UTF-8 (the latter indicating that any Unicode character
  22  *       is fair game and it will be specified using &#foo; if it
  23  *       isn't in output-charset). However, since XHTML defaults to
  24  *       UTF-8 and it's fiddly to tell it otherwise, it's just
  25  *       possible that some user may need to set restrict-charset
  26  *       to their charset of choice while leaving _output_-charset
  27  *       at UTF-8. Figure out some configuration, and apply it.
  28  *
  29  *  - nonbreaking spaces.
  30  *
  31  *  - free up all the data we have allocated while running this
  32  *    backend.
  33  */
  34
  35 #include <stdio.h>
  36 #include <stdlib.h>
  37 #include <assert.h>
  38 #include <limits.h>
  39 #include "halibut.h"
  40
  41 #define is_heading_type(type) ( (type) == para_Title || \
  42                                 (type) == para_Chapter || \
  43                                 (type) == para_Appendix || \
  44                                 (type) == para_UnnumberedChapter || \
  45                                 (type) == para_Heading || \
  46                                 (type) == para_Subsect)
  47
  48 #define heading_depth(p) ( (p)->type == para_Subsect ? (p)->aux + 1 : \
  49                            (p)->type == para_Heading ? 1 : \
  50                            (p)->type == para_Title ? -1 : 0 )
  51
  52 typedef struct {
  53     int just_numbers;
  54     wchar_t *number_suffix;
  55 } sectlevel;
  56
  57 typedef struct {
  58     int nasect;
  59     sectlevel achapter, *asect;
  60     int *contents_depths;              /* 0=main, 1=chapter, 2=sect etc */
  61     int ncdepths;
  62     int address_section, visible_version_id;
  63     int leaf_contains_contents, leaf_smallest_contents;
  64     char *contents_filename;
  65     char *index_filename;
  66     char *template_filename;
  67     char *single_filename;
  68     char *template_fragment;
  69     char *head_end, *body_start, *body_end, *addr_start, *addr_end;
  70     char *body_tag, *nav_attr;
  71     wchar_t *author, *description;
  72     int restrict_charset, output_charset;
  73     enum {
  74         HTML_3_2, HTML_4, ISO_HTML,
  75         XHTML_1_0_TRANSITIONAL, XHTML_1_0_STRICT
  76     } htmlver;
  77     wchar_t *lquote, *rquote;
  78     int leaf_level;
  79 } htmlconfig;
  80
  81 #define contents_depth(conf, level) \
  82     ( (conf).ncdepths > (level) ? (conf).contents_depths[level] : (level)+2 )
  83
  84 #define is_xhtml(ver) ((ver) >= XHTML_1_0_TRANSITIONAL)
  85
  86 typedef struct htmlfile htmlfile;
  87 typedef struct htmlsect htmlsect;
  88
  89 struct htmlfile {
  90     htmlfile *next;
  91     char *filename;
  92     int last_fragment_number;
  93     int min_heading_depth;
  94     htmlsect *first, *last;            /* first/last highest-level sections */
  95 };
  96
  97 struct htmlsect {
  98     htmlsect *next, *parent;
  99     htmlfile *file;
 100     paragraph *title, *text;
 101     enum { NORMAL, TOP, INDEX } type;
 102     int contents_depth;
 103     char *fragment;
 104 };
 105
 106 typedef struct {
 107     htmlfile *head, *tail;
 108     htmlfile *single, *index;
 109     tree234 *frags;
 110 } htmlfilelist;
 111
 112 typedef struct {
 113     htmlsect *head, *tail;
 114 } htmlsectlist;
 115
 116 typedef struct {
 117     htmlfile *file;
 118     char *fragment;
 119 } htmlfragment;
 120
 121 typedef struct {
 122     int nrefs, refsize;
 123     word **refs;
 124 } htmlindex;
 125
 126 typedef struct {
 127     htmlsect *section;
 128     char *fragment;
 129     int generated, referenced;
 130 } htmlindexref;
 131
 132 typedef struct {
 133     /*
 134      * This level deals with charset conversion, starting and
 135      * ending tags, and writing to the file. It's the lexical
 136      * level.
 137      */
 138     FILE *fp;
 139     int charset;
 140     charset_state cstate;
 141     int ver;
 142     enum {
 143         HO_NEUTRAL, HO_IN_TAG, HO_IN_EMPTY_TAG, HO_IN_TEXT
 144     } state;
 145     /*
 146      * Stuff beyond here deals with the higher syntactic level: it
 147      * tracks how many levels of <ul> are currently open when
 148      * producing a contents list, for example.
 149      */
 150     int contents_level;
 151 } htmloutput;
 152
 153 static int html_fragment_compare(void *av, void *bv)
 154 {
 155     htmlfragment *a = (htmlfragment *)av;
 156     htmlfragment *b = (htmlfragment *)bv;
 157     int cmp;
 158
 159     if ((cmp = strcmp(a->file->filename, b->file->filename)) != 0)
 160         return cmp;
 161     else
 162         return strcmp(a->fragment, b->fragment);
 163 }
 164
 165 static void html_file_section(htmlconfig *cfg, htmlfilelist *files,
 166                               htmlsect *sect, int depth);
 167
 168 static htmlfile *html_new_file(htmlfilelist *list, char *filename);
 169 static htmlsect *html_new_sect(htmlsectlist *list, paragraph *title);
 170
 171 /* Flags for html_words() flags parameter */
 172 #define NOTHING 0x00
 173 #define MARKUP 0x01
 174 #define LINKS 0x02
 175 #define INDEXENTS 0x04
 176 #define ALL 0x07
 177 static void html_words(htmloutput *ho, word *words, int flags,
 178                        htmlfile *file, keywordlist *keywords, htmlconfig *cfg);
 179 static void html_codepara(htmloutput *ho, word *words);
 180
 181 static void element_open(htmloutput *ho, char const *name);
 182 static void element_close(htmloutput *ho, char const *name);
 183 static void element_empty(htmloutput *ho, char const *name);
 184 static void element_attr(htmloutput *ho, char const *name, char const *value);
 185 static void element_attr_w(htmloutput *ho, char const *name,
 186                            wchar_t const *value);
 187 static void html_text(htmloutput *ho, wchar_t const *str);
 188 static void html_text_limit(htmloutput *ho, wchar_t const *str, int maxlen);
 189 static void html_text_limit_internal(htmloutput *ho, wchar_t const *text,
 190                                      int maxlen, int quote_quotes);
 191 static void html_nl(htmloutput *ho);
 192 static void html_raw(htmloutput *ho, char *text);
 193 static void html_raw_as_attr(htmloutput *ho, char *text);
 194 static void cleanup(htmloutput *ho);
 195
 196 static void html_href(htmloutput *ho, htmlfile *thisfile,
 197                       htmlfile *targetfile, char *targetfrag);
 198 static void html_fragment(htmloutput *ho, char const *fragment);
 199
 200 static char *html_format(paragraph *p, char *template_string);
 201 static char *html_sanitise_fragment(htmlfilelist *files, htmlfile *file,
 202                                     char *text);
 203
 204 static void html_contents_entry(htmloutput *ho, int depth, htmlsect *s,
 205                                 htmlfile *thisfile, keywordlist *keywords,
 206                                 htmlconfig *cfg);
 207 static void html_section_title(htmloutput *ho, htmlsect *s,
 208                                htmlfile *thisfile, keywordlist *keywords,
 209                                htmlconfig *cfg, int real);
 210
 211 static htmlconfig html_configure(paragraph *source) {
 212     htmlconfig ret;
 213     paragraph *p;
 214
 215     /*
 216      * Defaults.
 217      */
 218     ret.leaf_level = 2;
 219     ret.achapter.just_numbers = FALSE;
 220     ret.achapter.number_suffix = L": ";
 221     ret.nasect = 1;
 222     ret.asect = snewn(ret.nasect, sectlevel);
 223     ret.asect[0].just_numbers = TRUE;
 224     ret.asect[0].number_suffix = L" ";
 225     ret.ncdepths = 0;
 226     ret.contents_depths = 0;
 227     ret.visible_version_id = TRUE;
 228     ret.address_section = TRUE;
 229     ret.leaf_contains_contents = FALSE;
 230     ret.leaf_smallest_contents = 4;
 231     ret.single_filename = dupstr("Manual.html");
 232     ret.contents_filename = dupstr("Contents.html");
 233     ret.index_filename = dupstr("IndexPage.html");
 234     ret.template_filename = dupstr("%n.html");
 235     ret.template_fragment = dupstr("%b");
 236     ret.head_end = ret.body_tag = ret.body_start = ret.body_end =
 237         ret.addr_start = ret.addr_end = ret.nav_attr = NULL;
 238     ret.author = ret.description = NULL;
 239     ret.restrict_charset = CS_ASCII;
 240     ret.output_charset = CS_ASCII;
 241     ret.htmlver = HTML_4;
 242     /*
 243      * Default quote characters are Unicode matched single quotes,
 244      * falling back to ordinary ASCII ".
 245      */
 246     ret.lquote = L"\x2018\0\x2019\0\"\0\"\0\0";
 247     ret.rquote = uadv(ret.lquote);
 248
 249     /*
 250      * Two-pass configuration so that we can pick up global config
 251      * (e.g. `quotes') before having it overridden by specific
 252      * config (`html-quotes'), irrespective of the order in which
 253      * they occur.
 254      */
 255     for (p = source; p; p = p->next) {
 256         if (p->type == para_Config) {
 257             if (!ustricmp(p->keyword, L"quotes")) {
 258                 if (*uadv(p->keyword) && *uadv(uadv(p->keyword))) {
 259                     ret.lquote = uadv(p->keyword);
 260                     ret.rquote = uadv(ret.lquote);
 261                 }
 262             }
 263         }
 264     }
 265
 266     for (p = source; p; p = p->next) {
 267         if (p->type == para_Config) {
 268             wchar_t *k = p->keyword;
 269
 270             if (!ustrnicmp(k, L"xhtml-", 6))
 271                 k++;                /* treat `xhtml-' and `html-' the same */
 272
 273             if (!ustricmp(k, L"html-charset")) {
 274                 char *csname = utoa_dup(uadv(k), CS_ASCII);
 275                 ret.restrict_charset = ret.output_charset =
 276                     charset_from_localenc(csname);
 277                 sfree(csname);
 278             } else if (!ustricmp(k, L"html-version")) {
 279                 wchar_t *vername = uadv(k);
 280                 static const struct {
 281                     const wchar_t *name;
 282                     int ver;
 283                 } versions[] = {
 284                     {L"html3.2", HTML_3_2},
 285                     {L"html4", HTML_4},
 286                     {L"iso-html", ISO_HTML},
 287                     {L"xhtml1.0transitional", XHTML_1_0_TRANSITIONAL},
 288                     {L"xhtml1.0strict", XHTML_1_0_STRICT}
 289                 };
 290                 int i;
 291
 292                 for (i = 0; i < (int)lenof(versions); i++)
 293                     if (!ustricmp(versions[i].name, vername))
 294                         break;
 295
 296                 if (i == lenof(versions))
 297                     error(err_htmlver, &p->fpos, vername);
 298                 else
 299                     ret.htmlver = versions[i].ver;
 300             } else if (!ustricmp(k, L"html-single-filename")) {
 301                 sfree(ret.single_filename);
 302                 ret.single_filename = dupstr(adv(p->origkeyword));
 303             } else if (!ustricmp(k, L"html-contents-filename")) {
 304                 sfree(ret.contents_filename);
 305                 ret.contents_filename = dupstr(adv(p->origkeyword));
 306             } else if (!ustricmp(k, L"html-index-filename")) {
 307                 sfree(ret.index_filename);
 308                 ret.index_filename = dupstr(adv(p->origkeyword));
 309             } else if (!ustricmp(k, L"html-template-filename")) {
 310                 sfree(ret.template_filename);
 311                 ret.template_filename = dupstr(adv(p->origkeyword));
 312             } else if (!ustricmp(k, L"html-template-fragment")) {
 313                 sfree(ret.template_fragment);
 314                 ret.template_fragment = dupstr(adv(p->origkeyword));
 315             } else if (!ustricmp(k, L"html-chapter-numeric")) {
 316                 ret.achapter.just_numbers = utob(uadv(k));
 317             } else if (!ustricmp(k, L"html-chapter-suffix")) {
 318                 ret.achapter.number_suffix = uadv(k);
 319             } else if (!ustricmp(k, L"html-leaf-level")) {
 320                 ret.leaf_level = utoi(uadv(k));
 321             } else if (!ustricmp(k, L"html-section-numeric")) {
 322                 wchar_t *q = uadv(k);
 323                 int n = 0;
 324                 if (uisdigit(*q)) {
 325                     n = utoi(q);
 326                     q = uadv(q);
 327                 }
 328                 if (n >= ret.nasect) {
 329                     int i;
 330                     ret.asect = sresize(ret.asect, n+1, sectlevel);
 331                     for (i = ret.nasect; i <= n; i++)
 332                         ret.asect[i] = ret.asect[ret.nasect-1];
 333                     ret.nasect = n+1;
 334                 }
 335                 ret.asect[n].just_numbers = utob(q);
 336             } else if (!ustricmp(k, L"html-section-suffix")) {
 337                 wchar_t *q = uadv(k);
 338                 int n = 0;
 339                 if (uisdigit(*q)) {
 340                     n = utoi(q);
 341                     q = uadv(q);
 342                 }
 343                 if (n >= ret.nasect) {
 344                     int i;
 345                     ret.asect = sresize(ret.asect, n+1, sectlevel);
 346                     for (i = ret.nasect; i <= n; i++) {
 347                         ret.asect[i] = ret.asect[ret.nasect-1];
 348                     }
 349                     ret.nasect = n+1;
 350                 }
 351                 ret.asect[n].number_suffix = q;
 352             } else if (!ustricmp(k, L"html-contents-depth") ||
 353                        !ustrnicmp(k, L"html-contents-depth-", 20)) {
 354                 /*
 355                  * Relic of old implementation: this directive used
 356                  * to be written as \cfg{html-contents-depth-3}{2}
 357                  * rather than the usual Halibut convention of
 358                  * \cfg{html-contents-depth}{3}{2}. We therefore
 359                  * support both.
 360                  */
 361                 wchar_t *q = k[19] ? k+20 : uadv(k);
 362                 int n = 0;
 363                 if (uisdigit(*q)) {
 364                     n = utoi(q);
 365                     q = uadv(q);
 366                 }
 367                 if (n >= ret.ncdepths) {
 368                     int i;
 369                     ret.contents_depths =
 370                         sresize(ret.contents_depths, n+1, int);
 371                     for (i = ret.ncdepths; i <= n; i++) {
 372                         ret.contents_depths[i] = i+2;
 373                     }
 374                     ret.ncdepths = n+1;
 375                 }
 376                 ret.contents_depths[n] = utoi(q);
 377             } else if (!ustricmp(k, L"html-head-end")) {
 378                 ret.head_end = adv(p->origkeyword);
 379             } else if (!ustricmp(k, L"html-body-tag")) {
 380                 ret.body_tag = adv(p->origkeyword);
 381             } else if (!ustricmp(k, L"html-body-start")) {
 382                 ret.body_start = adv(p->origkeyword);
 383             } else if (!ustricmp(k, L"html-body-end")) {
 384                 ret.body_end = adv(p->origkeyword);
 385             } else if (!ustricmp(k, L"html-address-start")) {
 386                 ret.addr_start = adv(p->origkeyword);
 387             } else if (!ustricmp(k, L"html-address-end")) {
 388                 ret.addr_end = adv(p->origkeyword);
 389             } else if (!ustricmp(k, L"html-navigation-attributes")) {
 390                 ret.nav_attr = adv(p->origkeyword);
 391             } else if (!ustricmp(k, L"html-author")) {
 392                 ret.author = uadv(k);
 393             } else if (!ustricmp(k, L"html-description")) {
 394                 ret.description = uadv(k);
 395             } else if (!ustricmp(k, L"html-suppress-address")) {
 396                 ret.address_section = !utob(uadv(k));
 397             } else if (!ustricmp(k, L"html-versionid")) {
 398                 ret.visible_version_id = utob(uadv(k));
 399             } else if (!ustricmp(k, L"html-quotes")) {
 400                 if (*uadv(k) && *uadv(uadv(k))) {
 401                     ret.lquote = uadv(k);
 402                     ret.rquote = uadv(ret.lquote);
 403                 }
 404             } else if (!ustricmp(k, L"html-leaf-contains-contents")) {
 405                 ret.leaf_contains_contents = utob(uadv(k));
 406             } else if (!ustricmp(k, L"html-leaf-smallest-contents")) {
 407                 ret.leaf_smallest_contents = utoi(uadv(k));
 408             }
 409         }
 410     }
 411
 412     /*
 413      * Now process fallbacks on quote characters.
 414      */
 415     while (*uadv(ret.rquote) && *uadv(uadv(ret.rquote)) &&
 416            (!cvt_ok(ret.restrict_charset, ret.lquote) ||
 417             !cvt_ok(ret.restrict_charset, ret.rquote))) {
 418         ret.lquote = uadv(ret.rquote);
 419         ret.rquote = uadv(ret.lquote);
 420     }
 421
 422     return ret;
 423 }
 424
 425 paragraph *html_config_filename(char *filename)
 426 {
 427     /*
 428      * If the user passes in a single filename as a parameter to
 429      * the `--html' command-line option, then we should assume it
 430      * to imply _two_ config directives:
 431      * \cfg{html-single-filename}{whatever} and
 432      * \cfg{html-leaf-level}{0}; the rationale being that the user
 433      * wants their output _in that file_.
 434      */
 435     paragraph *p, *q;
 436
 437     p = cmdline_cfg_simple("html-single-filename", filename, NULL);
 438     q = cmdline_cfg_simple("html-leaf-level", "0", NULL);
 439     p->next = q;
 440     return p;
 441 }
 442
 443 void html_backend(paragraph *sourceform, keywordlist *keywords,
 444                   indexdata *idx, void *unused) {
 445     paragraph *p;
 446     htmlconfig conf;
 447     htmlfilelist files = { NULL, NULL, NULL, NULL, NULL };
 448     htmlsectlist sects = { NULL, NULL }, nonsects = { NULL, NULL };
 449
 450     IGNORE(unused);
 451
 452     conf = html_configure(sourceform);
 453
 454     /*
 455      * We're going to make heavy use of paragraphs' private data
 456      * fields in the forthcoming code. Clear them first, so we can
 457      * reliably tell whether we have auxiliary data for a
 458      * particular paragraph.
 459      */
 460     for (p = sourceform; p; p = p->next)
 461         p->private_data = NULL;
 462
 463     files.frags = newtree234(html_fragment_compare);
 464
 465     /*
 466      * Start by figuring out into which file each piece of the
 467      * document should be put. We'll do this by inventing an
 468      * `htmlsect' structure and stashing it in the private_data
 469      * field of each section paragraph; we also need one additional
 470      * htmlsect for the document index, which won't show up in the
 471      * source form but needs to be consistently mentioned in
 472      * contents links.
 473      *
 474      * While we're here, we'll also invent the HTML fragment name
 475      * for each section.
 476      */
 477     {
 478         htmlsect *topsect, *sect;
 479         int d;
 480
 481         topsect = html_new_sect(&sects, p);
 482         topsect->type = TOP;
 483         topsect->title = NULL;
 484         topsect->text = sourceform;
 485         topsect->contents_depth = contents_depth(conf, 0);
 486         html_file_section(&conf, &files, topsect, -1);
 487         topsect->fragment = NULL;
 488
 489         for (p = sourceform; p; p = p->next)
 490             if (is_heading_type(p->type)) {
 491                 d = heading_depth(p);
 492
 493                 if (p->type == para_Title) {
 494                     topsect->title = p;
 495                     continue;
 496                 }
 497
 498                 sect = html_new_sect(&sects, p);
 499                 sect->text = p->next;
 500
 501                 sect->contents_depth = contents_depth(conf, d+1) - (d+1);
 502
 503                 if (p->parent) {
 504                     sect->parent = (htmlsect *)p->parent->private_data;
 505                     assert(sect->parent != NULL);
 506                 } else
 507                     sect->parent = topsect;
 508                 p->private_data = sect;
 509
 510                 html_file_section(&conf, &files, sect, d);
 511
 512                 sect->fragment = html_format(p, conf.template_fragment);
 513                 sect->fragment = html_sanitise_fragment(&files, sect->file,
 514                                                         sect->fragment);
 515             }
 516
 517         /* And the index. */
 518         sect = html_new_sect(&sects, NULL);
 519         sect->text = NULL;
 520         sect->type = INDEX;
 521         sect->parent = topsect;
 522         html_file_section(&conf, &files, sect, 0);   /* peer of chapters */
 523         sect->fragment = dupstr("Index");   /* FIXME: this _can't_ be right */
 524         sect->fragment = html_sanitise_fragment(&files, sect->file,
 525                                                 sect->fragment);
 526         files.index = sect->file;
 527     }
 528
 529     /*
 530      * Go through the keyword list and sort out fragment IDs for
 531      * all the potentially referenced paragraphs which _aren't_
 532      * headings.
 533      */
 534     {
 535         int i;
 536         keyword *kw;
 537         htmlsect *sect;
 538
 539         for (i = 0; (kw = index234(keywords->keys, i)) != NULL; i++) {
 540             paragraph *q, *p = kw->para;
 541
 542             if (!is_heading_type(p->type)) {
 543                 htmlsect *parent;
 544
 545                 /*
 546                  * Find the paragraph's parent htmlsect, to
 547                  * determine which file it will end up in.
 548                  */
 549                 q = p->parent;
 550                 if (!q) {
 551                     /*
 552                      * Preamble paragraphs have no parent. So if we
 553                      * have a non-heading with no parent, it must
 554                      * be preamble, and therefore its parent
 555                      * htmlsect must be the preamble one.
 556                      */
 557                     assert(sects.head &&
 558                            sects.head->type == TOP);
 559                     parent = sects.head;
 560                 } else
 561                     parent = (htmlsect *)q->private_data;
 562
 563                 /*
 564                  * Now we can construct an htmlsect for this
 565                  * paragraph itself, taking care to put it in the
 566                  * list of non-sections rather than the list of
 567                  * sections (so that traverses of the `sects' list
 568                  * won't attempt to add it to the contents or
 569                  * anything weird like that).
 570                  */
 571                 sect = html_new_sect(&nonsects, p);
 572                 sect->file = parent->file;
 573                 sect->parent = parent;
 574                 p->private_data = sect;
 575
 576                 /*
 577                  * Fragment IDs for these paragraphs will simply be
 578                  * `p' followed by an integer.
 579                  */
 580                 sect->fragment = snewn(40, char);
 581                 sprintf(sect->fragment, "p%d",
 582                         sect->file->last_fragment_number++);
 583                 sect->fragment = html_sanitise_fragment(&files, sect->file,
 584                                                         sect->fragment);
 585             }
 586         }
 587     }
 588
 589     /*
 590      * Reset the fragment numbers in each file. I've just used them
 591      * to generate `p' fragment IDs for non-section paragraphs
 592      * (numbered list elements, bibliocited), and now I want to use
 593      * them for `i' fragment IDs for index entries.
 594      */
 595     {
 596         htmlfile *file;
 597         for (file = files.head; file; file = file->next)
 598             file->last_fragment_number = 0;
 599     }
 600
 601     /*
 602      * Now sort out the index. This involves:
 603      *
 604      *  - For each index term, we set up an htmlindex structure to
 605      *    store all the references to that term.
 606      *
 607      *  - Then we make a pass over the actual document, finding
 608      *    every word_IndexRef; for each one, we actually figure out
 609      *    the HTML filename/fragment pair we will use to reference
 610      *    it, store that information in the private data field of
 611      *    the word_IndexRef itself (so we can recreate it when the
 612      *    time comes to output our HTML), and add a reference to it
 613      *    to the index term in question.
 614      */
 615     {
 616         int i;
 617         indexentry *entry;
 618         htmlsect *lastsect;
 619         word *w;
 620
 621         /*
 622          * Set up the htmlindex structures.
 623          */
 624
 625         for (i = 0; (entry = index234(idx->entries, i)) != NULL; i++) {
 626             htmlindex *hi = snew(htmlindex);
 627
 628             hi->nrefs = hi->refsize = 0;
 629             hi->refs = NULL;
 630
 631             entry->backend_data = hi;
 632         }
 633
 634         /*
 635          * Run over the document inventing fragments. Each fragment
 636          * is of the form `i' followed by an integer.
 637          */
 638         lastsect = NULL;
 639         for (p = sourceform; p; p = p->next) {
 640             if (is_heading_type(p->type))
 641                 lastsect = (htmlsect *)p->private_data;
 642
 643             for (w = p->words; w; w = w->next)
 644                 if (w->type == word_IndexRef) {
 645                     htmlindexref *hr = snew(htmlindexref);
 646                     indextag *tag;
 647                     int i;
 648
 649                     hr->referenced = hr->generated = FALSE;
 650                     hr->section = lastsect;
 651                     {
 652                         char buf[40];
 653                         sprintf(buf, "i%d",
 654                                 lastsect->file->last_fragment_number++);
 655                         hr->fragment = dupstr(buf);
 656                         hr->fragment =
 657                             html_sanitise_fragment(&files, hr->section->file,
 658                                                    hr->fragment);
 659                     }
 660                     w->private_data = hr;
 661
 662                     tag = index_findtag(idx, w->text);
 663                     if (!tag)
 664                         break;
 665
 666                     for (i = 0; i < tag->nrefs; i++) {
 667                         indexentry *entry = tag->refs[i];
 668                         htmlindex *hi = (htmlindex *)entry->backend_data;
 669
 670                         if (hi->nrefs >= hi->refsize) {
 671                             hi->refsize += 32;
 672                             hi->refs = sresize(hi->refs, hi->refsize, word *);
 673                         }
 674
 675                         hi->refs[hi->nrefs++] = w;
 676                     }
 677                 }
 678         }
 679     }
 680
 681     /*
 682      * Now we're ready to write out the actual HTML files.
 683      *
 684      * For each file:
 685      *
 686      *  - we open that file and write its header
 687      *  - we run down the list of sections
 688      *  - for each section directly contained within that file, we
 689      *    output the section text
 690      *  - for each section which is not in the file but which has a
 691      *    parent that is, we output a contents entry for the
 692      *    section if appropriate
 693      *  - finally, we output the file trailer and close the file.
 694      */
 695     {
 696         htmlfile *f, *prevf;
 697         htmlsect *s;
 698         paragraph *p;
 699
 700         prevf = NULL;
 701
 702         for (f = files.head; f; f = f->next) {
 703             htmloutput ho;
 704             int displaying;
 705             enum LISTTYPE { NOLIST, UL, OL, DL };
 706             enum ITEMTYPE { NOITEM, LI, DT, DD };
 707             struct stackelement {
 708                 struct stackelement *next;
 709                 enum LISTTYPE listtype;
 710                 enum ITEMTYPE itemtype;
 711             } *stackhead;
 712
 713 #define listname(lt) ( (lt)==UL ? "ul" : (lt)==OL ? "ol" : "dl" )
 714 #define itemname(lt) ( (lt)==LI ? "li" : (lt)==DT ? "dt" : "dd" )
 715
 716             ho.fp = fopen(f->filename, "w");
 717             ho.charset = conf.output_charset;
 718             ho.cstate = charset_init_state;
 719             ho.ver = conf.htmlver;
 720             ho.state = HO_NEUTRAL;
 721             ho.contents_level = 0;
 722
 723             /* <!DOCTYPE>. */
 724             switch (conf.htmlver) {
 725               case HTML_3_2:
 726                 fprintf(ho.fp, "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD "
 727                         "HTML 3.2 Final//EN\">\n");
 728                 break;
 729               case HTML_4:
 730                 fprintf(ho.fp, "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML"
 731                         " 4.01//EN\"\n\"http://www.w3.org/TR/html4/"
 732                         "strict.dtd\">\n");
 733                 break;
 734               case ISO_HTML:
 735                 fprintf(ho.fp, "<!DOCTYPE HTML PUBLIC \"ISO/IEC "
 736                         "15445:2000//DTD HTML//EN\">\n");
 737                 break;
 738               case XHTML_1_0_TRANSITIONAL:
 739                 fprintf(ho.fp, "<?xml version=\"1.0\" encoding=\"%s\"?>\n",
 740                         charset_to_mimeenc(conf.output_charset));
 741                 fprintf(ho.fp, "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML"
 742                         " 1.0 Transitional//EN\"\n\"http://www.w3.org/TR/"
 743                         "xhtml1/DTD/xhtml1-transitional.dtd\">\n");
 744                 break;
 745               case XHTML_1_0_STRICT:
 746                 fprintf(ho.fp, "<?xml version=\"1.0\" encoding=\"%s\"?>\n",
 747                         charset_to_mimeenc(conf.output_charset));
 748                 fprintf(ho.fp, "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML"
 749                         " 1.0 Strict//EN\"\n\"http://www.w3.org/TR/xhtml1/"
 750                         "DTD/xhtml1-strict.dtd\">\n");
 751                 break;
 752             }
 753
 754             element_open(&ho, "html");
 755             if (is_xhtml(conf.htmlver)) {
 756                 element_attr(&ho, "xmlns", "http://www.w3.org/1999/xhtml");
 757             }
 758             html_nl(&ho);
 759
 760             element_open(&ho, "head");
 761             html_nl(&ho);
 762
 763             element_empty(&ho, "meta");
 764             element_attr(&ho, "http-equiv", "content-type");
 765             {
 766                 char buf[200];
 767                 sprintf(buf, "text/html; charset=%.150s",
 768                         charset_to_mimeenc(conf.output_charset));
 769                 element_attr(&ho, "content", buf);
 770             }
 771             html_nl(&ho);
 772
 773             if (conf.author) {
 774                 element_empty(&ho, "meta");
 775                 element_attr(&ho, "name", "author");
 776                 element_attr_w(&ho, "content", conf.author);
 777                 html_nl(&ho);
 778             }
 779
 780             if (conf.description) {
 781                 element_empty(&ho, "meta");
 782                 element_attr(&ho, "name", "description");
 783                 element_attr_w(&ho, "content", conf.description);
 784                 html_nl(&ho);
 785             }
 786
 787             element_open(&ho, "title");
 788             if (f->first && f->first->title) {
 789                 html_words(&ho, f->first->title->words, NOTHING,
 790                            f, keywords, &conf);
 791
 792                 assert(f->last);
 793                 if (f->last != f->first && f->last->title) {
 794                     html_text(&ho, L" - ");   /* FIXME: configurable? */
 795                     html_words(&ho, f->last->title->words, NOTHING,
 796                                f, keywords, &conf);
 797                 }
 798             }
 799             element_close(&ho, "title");
 800             html_nl(&ho);
 801
 802             if (conf.head_end)
 803                 html_raw(&ho, conf.head_end);
 804
 805             element_close(&ho, "head");
 806             html_nl(&ho);
 807
 808             /* FIXME: need to be able to specify replacement for this */
 809             if (conf.body_tag)
 810                 html_raw(&ho, conf.body_tag);
 811             else
 812                 element_open(&ho, "body");
 813             html_nl(&ho);
 814
 815             if (conf.body_start)
 816                 html_raw(&ho, conf.body_start);
 817
 818             /*
 819              * Write out a nav bar. Special case: we don't do this
 820              * if there is only one file.
 821              */
 822             if (files.head != files.tail) {
 823                 element_open(&ho, "p");
 824                 if (conf.nav_attr)
 825                     html_raw_as_attr(&ho, conf.nav_attr);
 826
 827                 if (prevf) {
 828                     element_open(&ho, "a");
 829                     element_attr(&ho, "href", prevf->filename);
 830                 }
 831                 html_text(&ho, L"Previous");/* FIXME: conf? */
 832                 if (prevf)
 833                     element_close(&ho, "a");
 834
 835                 html_text(&ho, L" | ");     /* FIXME: conf? */
 836
 837                 if (f != files.head) {
 838                     element_open(&ho, "a");
 839                     element_attr(&ho, "href", files.head->filename);
 840                 }
 841                 html_text(&ho, L"Contents");/* FIXME: conf? */
 842                 if (f != files.head)
 843                     element_close(&ho, "a");
 844
 845                 html_text(&ho, L" | ");     /* FIXME: conf? */
 846
 847                 if (f != files.index) {
 848                     element_open(&ho, "a");
 849                     element_attr(&ho, "href", files.index->filename);
 850                 }
 851                 html_text(&ho, L"Index");/* FIXME: conf? */
 852                 if (f != files.index)
 853                     element_close(&ho, "a");
 854
 855                 html_text(&ho, L" | ");     /* FIXME: conf? */
 856
 857                 if (f->next) {
 858                     element_open(&ho, "a");
 859                     element_attr(&ho, "href", f->next->filename);
 860                 }
 861                 html_text(&ho, L"Next");    /* FIXME: conf? */
 862                 if (f->next)
 863                     element_close(&ho, "a");
 864
 865                 element_close(&ho, "p");
 866                 html_nl(&ho);
 867             }
 868             prevf = f;
 869
 870             /*
 871              * Write out a prefix TOC for the file.
 872              *
 873              * We start by going through the section list and
 874              * collecting the sections which need to be added to
 875              * the contents. On the way, we also test to see if
 876              * this file is a leaf file (defined as one which
 877              * contains all descendants of any section it
 878              * contains), because this will play a part in our
 879              * decision on whether or not to _output_ the TOC.
 880              *
 881              * Special case: we absolutely do not do this if we're
 882              * in single-file mode.
 883              */
 884             if (files.head != files.tail) {
 885                 int ntoc = 0, tocsize = 0;
 886                 htmlsect **toc = NULL;
 887                 int leaf = TRUE;
 888
 889                 for (s = sects.head; s; s = s->next) {
 890                     htmlsect *a, *ac;
 891                     int depth, adepth;
 892
 893                     /*
 894                      * Search up from this section until we find
 895                      * the highest-level one which belongs in this
 896                      * file.
 897                      */
 898                     depth = adepth = 0;
 899                     a = NULL;
 900                     for (ac = s; ac; ac = ac->parent) {
 901                         if (ac->file == f) {
 902                             a = ac;
 903                             adepth = depth;
 904                         }
 905                         depth++;
 906                     }
 907
 908                     if (s->file != f && a != NULL)
 909                         leaf = FALSE;
 910
 911                     if (a) {
 912                         if (adepth <= a->contents_depth) {
 913                             if (ntoc >= tocsize) {
 914                                 tocsize += 64;
 915                                 toc = sresize(toc, tocsize, htmlsect *);
 916                             }
 917                             toc[ntoc++] = s;
 918                         }
 919                     }
 920                 }
 921
 922                 if (leaf && conf.leaf_contains_contents &&
 923                     ntoc >= conf.leaf_smallest_contents) {
 924                     int i;
 925
 926                     for (i = 0; i < ntoc; i++) {
 927                         htmlsect *s = toc[i];
 928                         int hlevel = (s->type == TOP ? -1 :
 929                                       s->type == INDEX ? 0 :
 930                                       heading_depth(s->title))
 931                             - f->min_heading_depth + 1;
 932
 933                         assert(hlevel >= 1);
 934                         html_contents_entry(&ho, hlevel, s,
 935                                             f, keywords, &conf);
 936                     }
 937                     html_contents_entry(&ho, 0, NULL, f, keywords, &conf);
 938                 }
 939             }
 940
 941             /*
 942              * Now go through the document and output some real
 943              * text.
 944              */
 945             displaying = FALSE;
 946             for (s = sects.head; s; s = s->next) {
 947                 if (s->file == f) {
 948                     /*
 949                      * This section belongs in this file.
 950                      * Display it.
 951                      */
 952                     displaying = TRUE;
 953                 } else {
 954                     htmlsect *a, *ac;
 955                     int depth, adepth;
 956
 957                     displaying = FALSE;
 958
 959                     /*
 960                      * Search up from this section until we find
 961                      * the highest-level one which belongs in this
 962                      * file.
 963                      */
 964                     depth = adepth = 0;
 965                     a = NULL;
 966                     for (ac = s; ac; ac = ac->parent) {
 967                         if (ac->file == f) {
 968                             a = ac;
 969                             adepth = depth;
 970                         }
 971                         depth++;
 972                     }
 973
 974                     if (a != NULL) {
 975                         /*
 976                          * This section does not belong in this
 977                          * file, but an ancestor of it does. Write
 978                          * out a contents table entry, if the depth
 979                          * doesn't exceed the maximum contents
 980                          * depth for the ancestor section.
 981                          */
 982                         if (adepth <= a->contents_depth) {
 983                             html_contents_entry(&ho, adepth, s,
 984                                                 f, keywords, &conf);
 985                         }
 986                     }
 987                 }
 988
 989                 if (displaying) {
 990                     int hlevel;
 991                     char htag[3];
 992
 993                     html_contents_entry(&ho, 0, NULL, f, keywords, &conf);
 994
 995                     /*
 996                      * Display the section heading.
 997                      */
 998
 999                     hlevel = (s->type == TOP ? -1 :
1000                               s->type == INDEX ? 0 :
1001                               heading_depth(s->title))
1002                         - f->min_heading_depth + 1;
1003                     assert(hlevel >= 1);
1004                     /* HTML headings only go up to <h6> */
1005                     if (hlevel > 6)
1006                         hlevel = 6;
1007                     htag[0] = 'h';
1008                     htag[1] = '0' + hlevel;
1009                     htag[2] = '\0';
1010                     element_open(&ho, htag);
1011
1012                     /*
1013                      * Provide anchor for cross-links to target.
1014                      *
1015                      * (Also we'll have to do this separately in
1016                      * other paragraph types - NumberedList and
1017                      * BiblioCited.)
1018                      */
1019                     if (s->fragment)
1020                         html_fragment(&ho, s->fragment);
1021
1022                     html_section_title(&ho, s, f, keywords, &conf, TRUE);
1023
1024                     element_close(&ho, htag);
1025
1026                     /*
1027                      * Now display the section text.
1028                      */
1029                     if (s->text) {
1030                         stackhead = snew(struct stackelement);
1031                         stackhead->next = NULL;
1032                         stackhead->listtype = NOLIST;
1033                         stackhead->itemtype = NOITEM;
1034
1035                         for (p = s->text;; p = p->next) {
1036                             enum LISTTYPE listtype;
1037                             struct stackelement *se;
1038
1039                             /*
1040                              * Preliminary switch to figure out what
1041                              * sort of list we expect to be inside at
1042                              * this stage.
1043                              *
1044                              * Since p may still be NULL at this point,
1045                              * I invent a harmless paragraph type for
1046                              * it if it is.
1047                              */
1048                             switch (p ? p->type : para_Normal) {
1049                               case para_Rule:
1050                               case para_Normal:
1051                               case para_Copyright:
1052                               case para_BiblioCited:
1053                               case para_Code:
1054                               case para_QuotePush:
1055                               case para_QuotePop:
1056                               case para_Chapter:
1057                               case para_Appendix:
1058                               case para_UnnumberedChapter:
1059                               case para_Heading:
1060                               case para_Subsect:
1061                               case para_LcontPop:
1062                                 listtype = NOLIST;
1063                                 break;
1064
1065                               case para_Bullet:
1066                                 listtype = UL;
1067                                 break;
1068
1069                               case para_NumberedList:
1070                                 listtype = OL;
1071                                 break;
1072
1073                               case para_DescribedThing:
1074                               case para_Description:
1075                                 listtype = DL;
1076                                 break;
1077
1078                               case para_LcontPush:
1079                                 se = snew(struct stackelement);
1080                                 se->next = stackhead;
1081                                 se->listtype = NOLIST;
1082                                 se->itemtype = NOITEM;
1083                                 stackhead = se;
1084                                 continue;
1085
1086                               default:     /* some totally non-printing para */
1087                                 continue;
1088                             }
1089
1090                             html_nl(&ho);
1091
1092                             /*
1093                              * Terminate the most recent list item, if
1094                              * any. (We left this until after
1095                              * processing LcontPush, since in that case
1096                              * the list item won't want to be
1097                              * terminated until after the corresponding
1098                              * LcontPop.)
1099                              */
1100                             if (stackhead->itemtype != NOITEM) {
1101                                 element_close(&ho, itemname(stackhead->itemtype));
1102                                 html_nl(&ho);
1103                             }
1104                             stackhead->itemtype = NOITEM;
1105
1106                             /*
1107                              * Terminate the current list, if it's not
1108                              * the one we want to be in.
1109                              */
1110                             if (listtype != stackhead->listtype &&
1111                                 stackhead->listtype != NOLIST) {
1112                                 element_close(&ho, listname(stackhead->listtype));
1113                                 html_nl(&ho);
1114                             }
1115
1116                             /*
1117                              * Leave the loop if our time has come.
1118                              */
1119                             if (!p || (is_heading_type(p->type) &&
1120                                        p->type != para_Title))
1121                                 break;     /* end of section text */
1122
1123                             /*
1124                              * Start a fresh list if necessary.
1125                              */
1126                             if (listtype != stackhead->listtype &&
1127                                 listtype != NOLIST)
1128                                 element_open(&ho, listname(listtype));
1129
1130                             stackhead->listtype = listtype;
1131
1132                             switch (p->type) {
1133                               case para_Rule:
1134                                 element_empty(&ho, "hr");
1135                                 break;
1136                               case para_Code:
1137                                 html_codepara(&ho, p->words);
1138                                 break;
1139                               case para_Normal:
1140                               case para_Copyright:
1141                                 element_open(&ho, "p");
1142                                 html_nl(&ho);
1143                                 html_words(&ho, p->words, ALL,
1144                                            f, keywords, &conf);
1145                                 html_nl(&ho);
1146                                 element_close(&ho, "p");
1147                                 break;
1148                               case para_BiblioCited:
1149                                 element_open(&ho, "p");
1150                                 if (p->private_data) {
1151                                     htmlsect *s = (htmlsect *)p->private_data;
1152                                     html_fragment(&ho, s->fragment);
1153                                 }
1154                                 html_nl(&ho);
1155                                 html_words(&ho, p->kwtext, ALL,
1156                                            f, keywords, &conf);
1157                                 html_text(&ho, L" ");
1158                                 html_words(&ho, p->words, ALL,
1159                                            f, keywords, &conf);
1160                                 html_nl(&ho);
1161                                 element_close(&ho, "p");
1162                                 break;
1163                               case para_Bullet:
1164                               case para_NumberedList:
1165                                 element_open(&ho, "li");
1166                                 if (p->private_data) {
1167                                     htmlsect *s = (htmlsect *)p->private_data;
1168                                     html_fragment(&ho, s->fragment);
1169                                 }
1170                                 html_nl(&ho);
1171                                 stackhead->itemtype = LI;
1172                                 html_words(&ho, p->words, ALL,
1173                                            f, keywords, &conf);
1174                                 break;
1175                               case para_DescribedThing:
1176                                 element_open(&ho, "dt");
1177                                 html_nl(&ho);
1178                                 stackhead->itemtype = DT;
1179                                 html_words(&ho, p->words, ALL,
1180                                            f, keywords, &conf);
1181                                 break;
1182                               case para_Description:
1183                                 element_open(&ho, "dd");
1184                                 html_nl(&ho);
1185                                 stackhead->itemtype = DD;
1186                                 html_words(&ho, p->words, ALL,
1187                                            f, keywords, &conf);
1188                                 break;
1189
1190                               case para_QuotePush:
1191                                 element_open(&ho, "blockquote");
1192                                 break;
1193                               case para_QuotePop:
1194                                 element_close(&ho, "blockquote");
1195                                 break;
1196
1197                               case para_LcontPop:
1198                                 se = stackhead;
1199                                 stackhead = stackhead->next;
1200                                 assert(stackhead);
1201                                 sfree(se);
1202                                 break;
1203                             }
1204                         }
1205
1206                         assert(stackhead && !stackhead->next);
1207                         sfree(stackhead);
1208                     }
1209
1210                     if (s->type == INDEX) {
1211                         indexentry *entry;
1212                         int i;
1213
1214                         /*
1215                          * This section is the index. I'll just
1216                          * render it as a single paragraph, with a
1217                          * colon between the index term and the
1218                          * references, and <br> in between each
1219                          * entry.
1220                          */
1221                         element_open(&ho, "p");
1222
1223                         for (i = 0; (entry =
1224                                      index234(idx->entries, i)) != NULL; i++) {
1225                             htmlindex *hi = (htmlindex *)entry->backend_data;
1226                             int j;
1227
1228                             if (i > 0)
1229                                 element_empty(&ho, "br");
1230                             html_nl(&ho);
1231
1232                             html_words(&ho, entry->text, MARKUP|LINKS,
1233                                        f, keywords, &conf);
1234
1235                             html_text(&ho, L": ");/* FIXME: configurable */
1236
1237                             for (j = 0; j < hi->nrefs; j++) {
1238                                 htmlindexref *hr =
1239                                     (htmlindexref *)hi->refs[j]->private_data;
1240                                 paragraph *p = hr->section->title;
1241
1242                                 if (j > 0)
1243                                     html_text(&ho, L", "); /* FIXME: conf */
1244
1245                                 html_href(&ho, f, hr->section->file,
1246                                           hr->fragment);
1247                                 hr->referenced = TRUE;
1248                                 if (p && p->kwtext)
1249                                     html_words(&ho, p->kwtext, MARKUP|LINKS,
1250                                                f, keywords, &conf);
1251                                 else if (p && p->words)
1252                                     html_words(&ho, p->words, MARKUP|LINKS,
1253                                                f, keywords, &conf);
1254                                 else
1255                                     html_text(&ho, L"FIXME");
1256                                 element_close(&ho, "a");
1257                             }
1258                         }
1259                         element_close(&ho, "p");
1260                     }
1261                 }
1262             }
1263
1264             html_contents_entry(&ho, 0, NULL, f, keywords, &conf);
1265             html_nl(&ho);
1266
1267             {
1268                 /*
1269                  * Footer.
1270                  */
1271                 int done_version_ids = FALSE;
1272
1273                 element_empty(&ho, "hr");
1274
1275                 if (conf.body_end)
1276                     html_raw(&ho, conf.body_end);
1277
1278                 if (conf.address_section) {
1279                     int started = FALSE;
1280                     if (conf.htmlver == ISO_HTML) {
1281                         /*
1282                          * The ISO-HTML validator complains if
1283                          * there isn't a <div> tag surrounding the
1284                          * <address> tag. I'm uncertain of why this
1285                          * should be - there appears to be no
1286                          * mention of this in the ISO-HTML spec,
1287                          * suggesting that it doesn't represent a
1288                          * change from HTML 4, but nonetheless the
1289                          * HTML 4 validator doesn't seem to mind.
1290                          */
1291                         element_open(&ho, "div");
1292                     }
1293                     element_open(&ho, "address");
1294                     if (conf.addr_start) {
1295                         html_raw(&ho, conf.addr_start);
1296                         html_nl(&ho);
1297                         started = TRUE;
1298                     }
1299                     if (conf.visible_version_id) {
1300                         for (p = sourceform; p; p = p->next)
1301                             if (p->type == para_VersionID) {
1302                                 if (started)
1303                                     element_empty(&ho, "br");
1304                                 html_nl(&ho);
1305                                 html_text(&ho, L"[");   /* FIXME: conf? */
1306                                 html_words(&ho, p->words, NOTHING,
1307                                            f, keywords, &conf);
1308                                 html_text(&ho, L"]");   /* FIXME: conf? */
1309                                 started = TRUE;
1310                             }
1311                         done_version_ids = TRUE;
1312                     }
1313                     if (conf.addr_end) {
1314                         if (started)
1315                             element_empty(&ho, "br");
1316                         html_raw(&ho, conf.addr_end);
1317                     }
1318                     element_close(&ho, "address");
1319                     if (conf.htmlver == ISO_HTML)
1320                         element_close(&ho, "div");
1321                 }
1322
1323                 if (!done_version_ids) {
1324                     /*
1325                      * If the user didn't want the version IDs
1326                      * visible, I think we still have a duty to put
1327                      * them in an HTML comment.
1328                      */
1329                     int started = FALSE;
1330                     for (p = sourceform; p; p = p->next)
1331                         if (p->type == para_VersionID) {
1332                             if (!started) {
1333                                 html_raw(&ho, "<!-- version IDs:\n");
1334                                 started = TRUE;
1335                             }
1336                             html_words(&ho, p->words, NOTHING,
1337                                        f, keywords, &conf);
1338                             html_nl(&ho);
1339                         }
1340                     if (started)
1341                         html_raw(&ho, "-->\n");
1342                 }
1343             }
1344
1345             element_close(&ho, "body");
1346             html_nl(&ho);
1347             element_close(&ho, "html");
1348             html_nl(&ho);
1349             cleanup(&ho);
1350         }
1351     }
1352
1353     /*
1354      * Go through and check that no index fragments were referenced
1355      * without being generated, or indeed vice versa.
1356      *
1357      * (When I actually get round to freeing everything, this can
1358      * probably be the freeing loop as well.)
1359      */
1360     for (p = sourceform; p; p = p->next) {
1361         word *w;
1362         for (w = p->words; w; w = w->next)
1363             if (w->type == word_IndexRef) {
1364                 htmlindexref *hr = (htmlindexref *)w->private_data;
1365
1366                 assert(!hr->referenced == !hr->generated);
1367             }
1368     }
1369
1370     /*
1371      * FIXME: Free all the working data.
1372      */
1373 }
1374
1375 static void html_file_section(htmlconfig *cfg, htmlfilelist *files,
1376                               htmlsect *sect, int depth)
1377 {
1378     htmlfile *file;
1379     int ldepth;
1380
1381     /*
1382      * `depth' is derived from the heading_depth() macro at the top
1383      * of this file, which counts title as -1, chapter as 0,
1384      * heading as 1 and subsection as 2. However, the semantics of
1385      * cfg->leaf_level are defined to count chapter as 1, heading
1386      * as 2 etc. So first I increment depth :-(
1387      */
1388     ldepth = depth + 1;
1389
1390     if (cfg->leaf_level == 0) {
1391         /*
1392          * leaf_level==0 is a special case, in which everything is
1393          * put into a single file.
1394          */
1395         if (!files->single)
1396             files->single = html_new_file(files, cfg->single_filename);
1397
1398         file = files->single;
1399     } else {
1400         /*
1401          * If the depth of this section is at or above leaf_level,
1402          * we invent a fresh file and put this section at its head.
1403          * Otherwise, we put it in the same file as its parent
1404          * section.
1405          */
1406         if (ldepth > cfg->leaf_level) {
1407             /*
1408              * We know that sect->parent cannot be NULL. The only
1409              * circumstance in which it can be is if sect is at
1410              * chapter or appendix level, i.e. ldepth==1; and if
1411              * that's the case, then we cannot have entered this
1412              * branch unless cfg->leaf_level==0, in which case we
1413              * would be in the single-file case above and not here
1414              * at all.
1415              */
1416             assert(sect->parent);
1417
1418             file = sect->parent->file;
1419         } else {
1420             if (sect->type == TOP) {
1421                 file = html_new_file(files, cfg->contents_filename);
1422             } else if (sect->type == INDEX) {
1423                 file = html_new_file(files, cfg->index_filename);
1424             } else {
1425                 char *title;
1426
1427                 assert(ldepth > 0 && sect->title);
1428                 title = html_format(sect->title, cfg->template_filename);
1429                 file = html_new_file(files, title);
1430                 sfree(title);
1431             }
1432         }
1433     }
1434
1435     sect->file = file;
1436
1437     if (file->min_heading_depth > depth) {
1438         /*
1439          * This heading is at a higher level than any heading we
1440          * have so far placed in this file; so we set the `first'
1441          * pointer.
1442          */
1443         file->min_heading_depth = depth;
1444         file->first = sect;
1445     }
1446
1447     if (file->min_heading_depth == depth)
1448         file->last = sect;
1449 }
1450
1451 static htmlfile *html_new_file(htmlfilelist *list, char *filename)
1452 {
1453     htmlfile *ret = snew(htmlfile);
1454
1455     ret->next = NULL;
1456     if (list->tail)
1457         list->tail->next = ret;
1458     else
1459         list->head = ret;
1460     list->tail = ret;
1461
1462     ret->filename = dupstr(filename);
1463     ret->last_fragment_number = 0;
1464     ret->min_heading_depth = INT_MAX;
1465     ret->first = ret->last = NULL;
1466
1467     return ret;
1468 }
1469
1470 static htmlsect *html_new_sect(htmlsectlist *list, paragraph *title)
1471 {
1472     htmlsect *ret = snew(htmlsect);
1473
1474     ret->next = NULL;
1475     if (list->tail)
1476         list->tail->next = ret;
1477     else
1478         list->head = ret;
1479     list->tail = ret;
1480
1481     ret->title = title;
1482     ret->file = NULL;
1483     ret->parent = NULL;
1484     ret->type = NORMAL;
1485
1486     return ret;
1487 }
1488
1489 static void html_words(htmloutput *ho, word *words, int flags,
1490                        htmlfile *file, keywordlist *keywords, htmlconfig *cfg)
1491 {
1492     word *w;
1493     char *c;
1494     int style, type;
1495
1496     for (w = words; w; w = w->next) switch (w->type) {
1497       case word_HyperLink:
1498         if (flags & LINKS) {
1499             element_open(ho, "a");
1500             c = utoa_dup(w->text, CS_ASCII);
1501             element_attr(ho, "href", c);
1502             sfree(c);
1503         }
1504         break;
1505       case word_UpperXref:
1506       case word_LowerXref:
1507         if (flags & LINKS) {
1508             keyword *kwl = kw_lookup(keywords, w->text);
1509             paragraph *p = kwl->para;
1510             htmlsect *s = (htmlsect *)p->private_data;
1511
1512             assert(s);
1513
1514             html_href(ho, file, s->file, s->fragment);
1515         }
1516         break;
1517       case word_HyperEnd:
1518       case word_XrefEnd:
1519         if (flags & LINKS)
1520             element_close(ho, "a");
1521         break;
1522       case word_IndexRef:
1523         if (flags & INDEXENTS) {
1524             htmlindexref *hr = (htmlindexref *)w->private_data;
1525             html_fragment(ho, hr->fragment);
1526             hr->generated = TRUE;
1527         }
1528         break;
1529       case word_Normal:
1530       case word_Emph:
1531       case word_Code:
1532       case word_WeakCode:
1533       case word_WhiteSpace:
1534       case word_EmphSpace:
1535       case word_CodeSpace:
1536       case word_WkCodeSpace:
1537       case word_Quote:
1538       case word_EmphQuote:
1539       case word_CodeQuote:
1540       case word_WkCodeQuote:
1541         style = towordstyle(w->type);
1542         type = removeattr(w->type);
1543         if (style == word_Emph &&
1544             (attraux(w->aux) == attr_First ||
1545              attraux(w->aux) == attr_Only) &&
1546             (flags & MARKUP))
1547             element_open(ho, "em");
1548         else if ((style == word_Code || style == word_WeakCode) &&
1549                  (attraux(w->aux) == attr_First ||
1550                   attraux(w->aux) == attr_Only) &&
1551                  (flags & MARKUP))
1552             element_open(ho, "code");
1553
1554         if (type == word_WhiteSpace)
1555             html_text(ho, L" ");
1556         else if (type == word_Quote) {
1557             if (quoteaux(w->aux) == quote_Open)
1558                 html_text(ho, cfg->lquote);
1559             else
1560                 html_text(ho, cfg->rquote);
1561         } else {
1562             if (cvt_ok(ho->charset, w->text) || !w->alt)
1563                 html_text(ho, w->text);
1564             else
1565                 html_words(ho, w->alt, flags, file, keywords, cfg);
1566         }
1567
1568         if (style == word_Emph &&
1569             (attraux(w->aux) == attr_Last ||
1570              attraux(w->aux) == attr_Only) &&
1571             (flags & MARKUP))
1572             element_close(ho, "em");
1573         else if ((style == word_Code || style == word_WeakCode) &&
1574                  (attraux(w->aux) == attr_Last ||
1575                   attraux(w->aux) == attr_Only) &&
1576                  (flags & MARKUP))
1577             element_close(ho, "code");
1578
1579         break;
1580     }
1581 }
1582
1583 static void html_codepara(htmloutput *ho, word *words)
1584 {
1585     element_open(ho, "pre");
1586     element_open(ho, "code");
1587     for (; words; words = words->next) if (words->type == word_WeakCode) {
1588         char *open_tag;
1589         wchar_t *t, *e;
1590
1591         t = words->text;
1592         if (words->next && words->next->type == word_Emph) {
1593             e = words->next->text;
1594             words = words->next;
1595         } else
1596             e = NULL;
1597
1598         while (e && *e && *t) {
1599             int n;
1600             int ec = *e;
1601
1602             for (n = 0; t[n] && e[n] && e[n] == ec; n++);
1603
1604             open_tag = NULL;
1605             if (ec == 'i')
1606                 open_tag = "em";
1607             else if (ec == 'b')
1608                 open_tag = "b";
1609             if (open_tag)
1610                 element_open(ho, open_tag);
1611
1612             html_text_limit(ho, t, n);
1613
1614             if (open_tag)
1615                 element_close(ho, open_tag);
1616
1617             t += n;
1618             e += n;
1619         }
1620         html_text(ho, t);
1621         html_nl(ho);
1622     }
1623     element_close(ho, "code");
1624     element_close(ho, "pre");
1625 }
1626
1627 static void html_charset_cleanup(htmloutput *ho)
1628 {
1629     char outbuf[256];
1630     int bytes;
1631
1632     bytes = charset_from_unicode(NULL, NULL, outbuf, lenof(outbuf),
1633                                  ho->charset, &ho->cstate, NULL);
1634     if (bytes > 0)
1635         fwrite(outbuf, 1, bytes, ho->fp);
1636 }
1637
1638 static void return_to_neutral(htmloutput *ho)
1639 {
1640     if (ho->state == HO_IN_TEXT) {
1641         html_charset_cleanup(ho);
1642     } else if (ho->state == HO_IN_EMPTY_TAG && is_xhtml(ho->ver)) {
1643         fprintf(ho->fp, " />");
1644     } else if (ho->state == HO_IN_EMPTY_TAG || ho->state == HO_IN_TAG) {
1645         fprintf(ho->fp, ">");
1646     }
1647
1648     ho->state = HO_NEUTRAL;
1649 }
1650
1651 static void element_open(htmloutput *ho, char const *name)
1652 {
1653     return_to_neutral(ho);
1654     fprintf(ho->fp, "<%s", name);
1655     ho->state = HO_IN_TAG;
1656 }
1657
1658 static void element_close(htmloutput *ho, char const *name)
1659 {
1660     return_to_neutral(ho);
1661     fprintf(ho->fp, "</%s>", name);
1662     ho->state = HO_NEUTRAL;
1663 }
1664
1665 static void element_empty(htmloutput *ho, char const *name)
1666 {
1667     return_to_neutral(ho);
1668     fprintf(ho->fp, "<%s", name);
1669     ho->state = HO_IN_EMPTY_TAG;
1670 }
1671
1672 static void html_nl(htmloutput *ho)
1673 {
1674     return_to_neutral(ho);
1675     fputc('\n', ho->fp);
1676 }
1677
1678 static void html_raw(htmloutput *ho, char *text)
1679 {
1680     return_to_neutral(ho);
1681     fputs(text, ho->fp);
1682 }
1683
1684 static void html_raw_as_attr(htmloutput *ho, char *text)
1685 {
1686     assert(ho->state == HO_IN_TAG || ho->state == HO_IN_EMPTY_TAG);
1687     fputc(' ', ho->fp);
1688     fputs(text, ho->fp);
1689 }
1690
1691 static void element_attr(htmloutput *ho, char const *name, char const *value)
1692 {
1693     html_charset_cleanup(ho);
1694     assert(ho->state == HO_IN_TAG || ho->state == HO_IN_EMPTY_TAG);
1695     fprintf(ho->fp, " %s=\"%s\"", name, value);
1696 }
1697
1698 static void element_attr_w(htmloutput *ho, char const *name,
1699                            wchar_t const *value)
1700 {
1701     html_charset_cleanup(ho);
1702     fprintf(ho->fp, " %s=\"", name);
1703     html_text_limit_internal(ho, value, 0, TRUE);
1704     html_charset_cleanup(ho);
1705     fputc('"', ho->fp);
1706 }
1707
1708 static void html_text(htmloutput *ho, wchar_t const *text)
1709 {
1710     html_text_limit(ho, text, 0);
1711 }
1712
1713 static void html_text_limit(htmloutput *ho, wchar_t const *text, int maxlen)
1714 {
1715     return_to_neutral(ho);
1716     html_text_limit_internal(ho, text, maxlen, FALSE);
1717 }
1718
1719 static void html_text_limit_internal(htmloutput *ho, wchar_t const *text,
1720                                      int maxlen, int quote_quotes)
1721 {
1722     int textlen = ustrlen(text);
1723     char outbuf[256];
1724     int bytes, err;
1725
1726     if (maxlen > 0 && textlen > maxlen)
1727         textlen = maxlen;
1728
1729     while (textlen > 0) {
1730         /* Scan ahead for characters we really can't display in HTML. */
1731         int lenbefore, lenafter;
1732         for (lenbefore = 0; lenbefore < textlen; lenbefore++)
1733             if (text[lenbefore] == L'<' ||
1734                 text[lenbefore] == L'>' ||
1735                 text[lenbefore] == L'&' ||
1736                 (text[lenbefore] == L'"' && quote_quotes))
1737                 break;
1738         lenafter = lenbefore;
1739         bytes = charset_from_unicode(&text, &lenafter, outbuf, lenof(outbuf),
1740                                      ho->charset, &ho->cstate, &err);
1741         textlen -= (lenbefore - lenafter);
1742         if (bytes > 0)
1743             fwrite(outbuf, 1, bytes, ho->fp);
1744         if (err) {
1745             /*
1746              * We have encountered a character that cannot be
1747              * displayed in the selected output charset. Therefore,
1748              * we use an HTML numeric entity reference.
1749              */
1750             assert(textlen > 0);
1751             fprintf(ho->fp, "&#%ld;", (long int)*text);
1752             text++, textlen--;
1753         } else if (lenafter == 0 && textlen > 0) {
1754             /*
1755              * We have encountered a character which is special to
1756              * HTML.
1757              */
1758             if (*text == L'<')
1759                 fprintf(ho->fp, "&lt;");
1760             else if (*text == L'>')
1761                 fprintf(ho->fp, "&gt;");
1762             else if (*text == L'&')
1763                 fprintf(ho->fp, "&amp;");
1764             else if (*text == L'"')
1765                 fprintf(ho->fp, "&quot;");
1766             else
1767                 assert(!"Can't happen");
1768             text++, textlen--;
1769         }
1770     }
1771 }
1772
1773 static void cleanup(htmloutput *ho)
1774 {
1775     return_to_neutral(ho);
1776     fclose(ho->fp);
1777 }
1778
1779 static void html_href(htmloutput *ho, htmlfile *thisfile,
1780                       htmlfile *targetfile, char *targetfrag)
1781 {
1782     rdstringc rs = { 0, 0, NULL };
1783     char *url;
1784
1785     if (targetfile != thisfile)
1786         rdaddsc(&rs, targetfile->filename);
1787     if (targetfrag) {
1788         rdaddc(&rs, '#');
1789         rdaddsc(&rs, targetfrag);
1790     }
1791     url = rs.text;
1792
1793     element_open(ho, "a");
1794     element_attr(ho, "href", url);
1795     sfree(url);
1796 }
1797
1798 static void html_fragment(htmloutput *ho, char const *fragment)
1799 {
1800     element_open(ho, "a");
1801     element_attr(ho, "name", fragment);
1802     if (is_xhtml(ho->ver))
1803         element_attr(ho, "id", fragment);
1804     element_close(ho, "a");
1805 }
1806
1807 static char *html_format(paragraph *p, char *template_string)
1808 {
1809     char *c, *t;
1810     word *w;
1811     wchar_t *ws, wsbuf[2];
1812     rdstringc rs = { 0, 0, NULL };
1813
1814     t = template_string;
1815     while (*t) {
1816         if (*t == '%' && t[1]) {
1817             int fmt;
1818
1819             t++;
1820             fmt = *t++;
1821
1822             if (fmt == '%') {
1823                 rdaddc(&rs, fmt);
1824                 continue;
1825             }
1826
1827             w = NULL;
1828             ws = NULL;
1829
1830             if (p->kwtext && fmt == 'n')
1831                 w = p->kwtext;
1832             else if (p->kwtext2 && fmt == 'b') {
1833                 /*
1834                  * HTML fragment names must start with a letter, so
1835                  * simply `1.2.3' is not adequate. In this case I'm
1836                  * going to cheat slightly by prepending the first
1837                  * character of the first word of kwtext, so that
1838                  * we get `C1' for chapter 1, `S2.3' for section
1839                  * 2.3 etc.
1840                  */
1841                 if (p->kwtext && p->kwtext->text[0]) {
1842                     ws = wsbuf;
1843                     wsbuf[1] = '\0';
1844                     wsbuf[0] = p->kwtext->text[0];
1845                 }
1846                 w = p->kwtext2;
1847             } else if (p->keyword && *p->keyword && fmt == 'k')
1848                 ws = p->keyword;
1849             else
1850                 w = p->words;
1851
1852             if (ws) {
1853                 c = utoa_dup(ws, CS_ASCII);
1854                 rdaddsc(&rs,c);
1855                 sfree(c);
1856             }
1857
1858             while (w) {
1859                 if (removeattr(w->type) == word_Normal) {
1860                     c = utoa_dup(w->text, CS_ASCII);
1861                     rdaddsc(&rs,c);
1862                     sfree(c);
1863                 }
1864                 w = w->next;
1865             }
1866         } else {
1867             rdaddc(&rs, *t++);
1868         }
1869     }
1870
1871     return rdtrimc(&rs);
1872 }
1873
1874 static char *html_sanitise_fragment(htmlfilelist *files, htmlfile *file,
1875                                     char *text)
1876 {
1877     /*
1878      * The HTML 4 spec's strictest definition of fragment names (<a
1879      * name> and "id" attributes) says that they `must begin with a
1880      * letter and may be followed by any number of letters, digits,
1881      * hyphens, underscores, colons, and periods'.
1882      *
1883      * So here we unceremoniously rip out any characters not
1884      * conforming to this limitation.
1885      */
1886     char *p = text, *q = text;
1887
1888     while (*p && !((*p>='A' && *p<='Z') || (*p>='a' && *p<='z')))
1889         p++;
1890     if ((*q++ = *p++) != '\0') {
1891         while (*p) {
1892             if ((*p>='A' && *p<='Z') ||
1893                 (*p>='a' && *p<='z') ||
1894                 (*p>='0' && *p<='9') ||
1895                 *p=='-' || *p=='_' || *p==':' || *p=='.')
1896                 *q++ = *p;
1897             p++;
1898         }
1899
1900         *q = '\0';
1901     }
1902
1903     /*
1904      * Now we check for clashes with other fragment names, and
1905      * adjust this one if necessary by appending a hyphen followed
1906      * by a number.
1907      */
1908     {
1909         htmlfragment *frag = snew(htmlfragment);
1910         int len = 0;                   /* >0 indicates we have resized */
1911         int suffix = 1;
1912
1913         frag->file = file;
1914         frag->fragment = text;
1915
1916         while (add234(files->frags, frag) != frag) {
1917             if (!len) {
1918                 len = strlen(text);
1919                 frag->fragment = text = sresize(text, len+20, char);
1920             }
1921
1922             sprintf(text + len, "-%d", ++suffix);
1923         }
1924     }
1925
1926     return text;
1927 }
1928
1929 static void html_contents_entry(htmloutput *ho, int depth, htmlsect *s,
1930                                 htmlfile *thisfile, keywordlist *keywords,
1931                                 htmlconfig *cfg)
1932 {
1933     while (ho->contents_level > depth) {
1934         element_close(ho, "ul");
1935         ho->contents_level--;
1936     }
1937
1938     while (ho->contents_level < depth) {
1939         element_open(ho, "ul");
1940         ho->contents_level++;
1941     }
1942
1943     if (!s)
1944         return;
1945
1946     element_open(ho, "li");
1947     html_href(ho, thisfile, s->file, s->fragment);
1948     html_section_title(ho, s, thisfile, keywords, cfg, FALSE);
1949     element_close(ho, "a");
1950     element_close(ho, "li");
1951 }
1952
1953 static void html_section_title(htmloutput *ho, htmlsect *s, htmlfile *thisfile,
1954                                keywordlist *keywords, htmlconfig *cfg,
1955                                int real)
1956 {
1957     if (s->title) {
1958         sectlevel *sl;
1959         word *number;
1960         int depth = heading_depth(s->title);
1961
1962         if (depth < 0)
1963             sl = NULL;
1964         else if (depth == 0)
1965             sl = &cfg->achapter;
1966         else if (depth <= cfg->nasect)
1967             sl = &cfg->asect[depth-1];
1968         else
1969             sl = &cfg->asect[cfg->nasect-1];
1970
1971         if (!sl)
1972             number = NULL;
1973         else if (sl->just_numbers)
1974             number = s->title->kwtext2;
1975         else
1976             number = s->title->kwtext;
1977
1978         if (number) {
1979             html_words(ho, number, MARKUP,
1980                        thisfile, keywords, cfg);
1981             html_text(ho, sl->number_suffix);
1982         }
1983
1984         html_words(ho, s->title->words, real ? ALL : MARKUP,
1985                    thisfile, keywords, cfg);
1986     } else {
1987         assert(s->type != NORMAL);
1988         if (s->type == TOP)
1989             html_text(ho, L"Preamble");/* FIXME: configure */
1990         else if (s->type == INDEX)
1991             html_text(ho, L"Index");/* FIXME: configure */
1992     }
1993 }