mdw@git.distorted.org.uk Git - sgt/halibut/blob - bk_html.c

   1 /*
   2  * HTML backend for Halibut
   3  */
   4
   5 /*
   6  * TODO:
   7  *
   8  *  - I'm never entirely convinced that having a fragment link to
   9  *    come in at the start of the real text in the file is
  10  *    sensible. Perhaps for the topmost section in the file, no
  11  *    fragment should be used? (Though it should probably still be
  12  *    _there_ even if unused.)
  13  *
  14  *  - new configurability:
  15  *     * a few new things explicitly labelled as `FIXME:
  16  *       configurable' or similar.
  17  *     * HTML flavour.
  18  *     * Some means of specifying the distinction between
  19  *       restrict-charset and output-charset. It seems to me that
  20  *       `html-charset' is output-charset, and that
  21  *       restrict-charset usually wants to be either output-charset
  22  *       or UTF-8 (the latter indicating that any Unicode character
  23  *       is fair game and it will be specified using &#foo; if it
  24  *       isn't in output-charset). However, since XHTML defaults to
  25  *       UTF-8 and it's fiddly to tell it otherwise, it's just
  26  *       possible that some user may need to set restrict-charset
  27  *       to their charset of choice while leaving _output_-charset
  28  *       at UTF-8. Figure out some configuration, and apply it.
  29  *
  30  *  - test all HTML flavours and ensure they validate sensibly. Fix
  31  *    remaining confusion issues such as <?xml?> and obsoleteness
  32  *    of <a name>.
  33  *
  34  *  - proper naming of all fragment IDs. The ones for sections are
  35  *    fine; the ones for numbered list and bibliociteds are utter
  36  *    crap; the ones for indexes _might_ do but it might be worth
  37  *    giving some thought to how to do them better.
  38  *
  39  *  - nonbreaking spaces.
  40  *
  41  *  - free up all the data we have allocated while running this
  42  *    backend.
  43  */
  44
  45 #include <stdio.h>
  46 #include <stdlib.h>
  47 #include <assert.h>
  48 #include <limits.h>
  49 #include "halibut.h"
  50
  51 #define is_heading_type(type) ( (type) == para_Title || \
  52                                 (type) == para_Chapter || \
  53                                 (type) == para_Appendix || \
  54                                 (type) == para_UnnumberedChapter || \
  55                                 (type) == para_Heading || \
  56                                 (type) == para_Subsect)
  57
  58 #define heading_depth(p) ( (p)->type == para_Subsect ? (p)->aux + 1 : \
  59                            (p)->type == para_Heading ? 1 : \
  60                            (p)->type == para_Title ? -1 : 0 )
  61
  62 typedef struct {
  63     int just_numbers;
  64     wchar_t *number_suffix;
  65 } sectlevel;
  66
  67 typedef struct {
  68     int nasect;
  69     sectlevel achapter, *asect;
  70     int *contents_depths;              /* 0=main, 1=chapter, 2=sect etc */
  71     int ncdepths;
  72     int address_section, visible_version_id;
  73     int leaf_contains_contents, leaf_smallest_contents;
  74     char *contents_filename;
  75     char *index_filename;
  76     char *template_filename;
  77     char *single_filename;
  78     char *template_fragment;
  79     char *head_end, *body_start, *body_end, *addr_start, *addr_end;
  80     char *body_tag, *nav_attr;
  81     wchar_t *author, *description;
  82     int restrict_charset, output_charset;
  83     enum {
  84         HTML_3_2, HTML_4,
  85         XHTML_1_0_TRANSITIONAL, XHTML_1_0_STRICT
  86     } htmlver;
  87     wchar_t *lquote, *rquote;
  88     int leaf_level;
  89 } htmlconfig;
  90
  91 #define contents_depth(conf, level) \
  92     ( (conf).ncdepths > (level) ? (conf).contents_depths[level] : (level)+2 )
  93
  94 #define is_xhtml(ver) ((ver) >= XHTML_1_0_TRANSITIONAL)
  95
  96 typedef struct htmlfile htmlfile;
  97 typedef struct htmlsect htmlsect;
  98
  99 struct htmlfile {
 100     htmlfile *next;
 101     char *filename;
 102     int last_fragment_number;
 103     int min_heading_depth;
 104     htmlsect *first, *last;            /* first/last highest-level sections */
 105 };
 106
 107 struct htmlsect {
 108     htmlsect *next, *parent;
 109     htmlfile *file;
 110     paragraph *title, *text;
 111     enum { NORMAL, TOP, INDEX } type;
 112     int contents_depth;
 113     char *fragment;
 114 };
 115
 116 typedef struct {
 117     htmlfile *head, *tail;
 118     htmlfile *single, *index;
 119     tree234 *frags;
 120 } htmlfilelist;
 121
 122 typedef struct {
 123     htmlsect *head, *tail;
 124 } htmlsectlist;
 125
 126 typedef struct {
 127     htmlfile *file;
 128     char *fragment;
 129 } htmlfragment;
 130
 131 typedef struct {
 132     int nrefs, refsize;
 133     word **refs;
 134 } htmlindex;
 135
 136 typedef struct {
 137     htmlsect *section;
 138     char *fragment;
 139 } htmlindexref;
 140
 141 typedef struct {
 142     /*
 143      * This level deals with charset conversion, starting and
 144      * ending tags, and writing to the file. It's the lexical
 145      * level.
 146      */
 147     FILE *fp;
 148     int charset;
 149     charset_state cstate;
 150     int ver;
 151     enum {
 152         HO_NEUTRAL, HO_IN_TAG, HO_IN_EMPTY_TAG, HO_IN_TEXT
 153     } state;
 154     /*
 155      * Stuff beyond here deals with the higher syntactic level: it
 156      * tracks how many levels of <ul> are currently open when
 157      * producing a contents list, for example.
 158      */
 159     int contents_level;
 160 } htmloutput;
 161
 162 static int html_fragment_compare(void *av, void *bv)
 163 {
 164     htmlfragment *a = (htmlfragment *)av;
 165     htmlfragment *b = (htmlfragment *)bv;
 166     int cmp;
 167
 168     if ((cmp = strcmp(a->file->filename, b->file->filename)) != 0)
 169         return cmp;
 170     else
 171         return strcmp(a->fragment, b->fragment);
 172 }
 173
 174 static void html_file_section(htmlconfig *cfg, htmlfilelist *files,
 175                               htmlsect *sect, int depth);
 176
 177 static htmlfile *html_new_file(htmlfilelist *list, char *filename);
 178 static htmlsect *html_new_sect(htmlsectlist *list, paragraph *title);
 179
 180 /* Flags for html_words() flags parameter */
 181 #define NOTHING 0x00
 182 #define MARKUP 0x01
 183 #define LINKS 0x02
 184 #define INDEXENTS 0x04
 185 #define ALL 0x07
 186 static void html_words(htmloutput *ho, word *words, int flags,
 187                        htmlfile *file, keywordlist *keywords, htmlconfig *cfg);
 188 static void html_codepara(htmloutput *ho, word *words);
 189
 190 static void element_open(htmloutput *ho, char const *name);
 191 static void element_close(htmloutput *ho, char const *name);
 192 static void element_empty(htmloutput *ho, char const *name);
 193 static void element_attr(htmloutput *ho, char const *name, char const *value);
 194 static void element_attr_w(htmloutput *ho, char const *name,
 195                            wchar_t const *value);
 196 static void html_text(htmloutput *ho, wchar_t const *str);
 197 static void html_text_limit(htmloutput *ho, wchar_t const *str, int maxlen);
 198 static void html_text_limit_internal(htmloutput *ho, wchar_t const *text,
 199                                      int maxlen, int quote_quotes);
 200 static void html_nl(htmloutput *ho);
 201 static void html_raw(htmloutput *ho, char *text);
 202 static void html_raw_as_attr(htmloutput *ho, char *text);
 203 static void cleanup(htmloutput *ho);
 204
 205 static void html_href(htmloutput *ho, htmlfile *thisfile,
 206                       htmlfile *targetfile, char *targetfrag);
 207
 208 static char *html_format(paragraph *p, char *template_string);
 209 static char *html_sanitise_fragment(htmlfilelist *files, htmlfile *file,
 210                                     char *text);
 211
 212 static void html_contents_entry(htmloutput *ho, int depth, htmlsect *s,
 213                                 htmlfile *thisfile, keywordlist *keywords,
 214                                 htmlconfig *cfg);
 215 static void html_section_title(htmloutput *ho, htmlsect *s,
 216                                htmlfile *thisfile, keywordlist *keywords,
 217                                htmlconfig *cfg, int real);
 218
 219 static htmlconfig html_configure(paragraph *source) {
 220     htmlconfig ret;
 221     paragraph *p;
 222
 223     /*
 224      * Defaults.
 225      */
 226     ret.leaf_level = 2;
 227     ret.achapter.just_numbers = FALSE;
 228     ret.achapter.number_suffix = L": ";
 229     ret.nasect = 1;
 230     ret.asect = snewn(ret.nasect, sectlevel);
 231     ret.asect[0].just_numbers = TRUE;
 232     ret.asect[0].number_suffix = L" ";
 233     ret.ncdepths = 0;
 234     ret.contents_depths = 0;
 235     ret.visible_version_id = TRUE;
 236     ret.address_section = TRUE;
 237     ret.leaf_contains_contents = FALSE;
 238     ret.leaf_smallest_contents = 4;
 239     ret.single_filename = dupstr("Manual.html");
 240     ret.contents_filename = dupstr("Contents.html");
 241     ret.index_filename = dupstr("IndexPage.html");
 242     ret.template_filename = dupstr("%n.html");
 243     ret.template_fragment = dupstr("%b");
 244     ret.head_end = ret.body_tag = ret.body_start = ret.body_end =
 245         ret.addr_start = ret.addr_end = ret.nav_attr = NULL;
 246     ret.author = ret.description = NULL;
 247     ret.restrict_charset = CS_ASCII;
 248     ret.output_charset = CS_ASCII;
 249     ret.htmlver = HTML_4;
 250     /*
 251      * Default quote characters are Unicode matched single quotes,
 252      * falling back to ordinary ASCII ".
 253      */
 254     ret.lquote = L"\x2018\0\x2019\0\"\0\"\0\0";
 255     ret.rquote = uadv(ret.lquote);
 256
 257     /*
 258      * Two-pass configuration so that we can pick up global config
 259      * (e.g. `quotes') before having it overridden by specific
 260      * config (`html-quotes'), irrespective of the order in which
 261      * they occur.
 262      */
 263     for (p = source; p; p = p->next) {
 264         if (p->type == para_Config) {
 265             if (!ustricmp(p->keyword, L"quotes")) {
 266                 if (*uadv(p->keyword) && *uadv(uadv(p->keyword))) {
 267                     ret.lquote = uadv(p->keyword);
 268                     ret.rquote = uadv(ret.lquote);
 269                 }
 270             }
 271         }
 272     }
 273
 274     for (p = source; p; p = p->next) {
 275         if (p->type == para_Config) {
 276             wchar_t *k = p->keyword;
 277
 278             if (!ustrnicmp(k, L"xhtml-", 6))
 279                 k++;                /* treat `xhtml-' and `html-' the same */
 280
 281             if (!ustricmp(k, L"html-charset")) {
 282                 char *csname = utoa_dup(uadv(k), CS_ASCII);
 283                 ret.restrict_charset = ret.output_charset =
 284                     charset_from_localenc(csname);
 285                 sfree(csname);
 286             } else if (!ustricmp(k, L"html-single-filename")) {
 287                 sfree(ret.single_filename);
 288                 ret.single_filename = dupstr(adv(p->origkeyword));
 289             } else if (!ustricmp(k, L"html-contents-filename")) {
 290                 sfree(ret.contents_filename);
 291                 ret.contents_filename = dupstr(adv(p->origkeyword));
 292             } else if (!ustricmp(k, L"html-index-filename")) {
 293                 sfree(ret.index_filename);
 294                 ret.index_filename = dupstr(adv(p->origkeyword));
 295             } else if (!ustricmp(k, L"html-template-filename")) {
 296                 sfree(ret.template_filename);
 297                 ret.template_filename = dupstr(adv(p->origkeyword));
 298             } else if (!ustricmp(k, L"html-template-fragment")) {
 299                 sfree(ret.template_fragment);
 300                 ret.template_fragment = dupstr(adv(p->origkeyword));
 301             } else if (!ustricmp(k, L"html-chapter-numeric")) {
 302                 ret.achapter.just_numbers = utob(uadv(k));
 303             } else if (!ustricmp(k, L"html-chapter-suffix")) {
 304                 ret.achapter.number_suffix = uadv(k);
 305             } else if (!ustricmp(k, L"html-leaf-level")) {
 306                 ret.leaf_level = utoi(uadv(k));
 307             } else if (!ustricmp(k, L"html-section-numeric")) {
 308                 wchar_t *q = uadv(k);
 309                 int n = 0;
 310                 if (uisdigit(*q)) {
 311                     n = utoi(q);
 312                     q = uadv(q);
 313                 }
 314                 if (n >= ret.nasect) {
 315                     int i;
 316                     ret.asect = sresize(ret.asect, n+1, sectlevel);
 317                     for (i = ret.nasect; i <= n; i++)
 318                         ret.asect[i] = ret.asect[ret.nasect-1];
 319                     ret.nasect = n+1;
 320                 }
 321                 ret.asect[n].just_numbers = utob(q);
 322             } else if (!ustricmp(k, L"html-section-suffix")) {
 323                 wchar_t *q = uadv(k);
 324                 int n = 0;
 325                 if (uisdigit(*q)) {
 326                     n = utoi(q);
 327                     q = uadv(q);
 328                 }
 329                 if (n >= ret.nasect) {
 330                     int i;
 331                     ret.asect = sresize(ret.asect, n+1, sectlevel);
 332                     for (i = ret.nasect; i <= n; i++) {
 333                         ret.asect[i] = ret.asect[ret.nasect-1];
 334                     }
 335                     ret.nasect = n+1;
 336                 }
 337                 ret.asect[n].number_suffix = q;
 338             } else if (!ustricmp(k, L"html-contents-depth") ||
 339                        !ustrnicmp(k, L"html-contents-depth-", 20)) {
 340                 /*
 341                  * Relic of old implementation: this directive used
 342                  * to be written as \cfg{html-contents-depth-3}{2}
 343                  * rather than the usual Halibut convention of
 344                  * \cfg{html-contents-depth}{3}{2}. We therefore
 345                  * support both.
 346                  */
 347                 wchar_t *q = k[19] ? k+20 : uadv(k);
 348                 int n = 0;
 349                 if (uisdigit(*q)) {
 350                     n = utoi(q);
 351                     q = uadv(q);
 352                 }
 353                 if (n >= ret.ncdepths) {
 354                     int i;
 355                     ret.contents_depths =
 356                         sresize(ret.contents_depths, n+1, int);
 357                     for (i = ret.ncdepths; i <= n; i++) {
 358                         ret.contents_depths[i] = i+2;
 359                     }
 360                     ret.ncdepths = n+1;
 361                 }
 362                 ret.contents_depths[n] = utoi(q);
 363             } else if (!ustricmp(k, L"html-head-end")) {
 364                 ret.head_end = adv(p->origkeyword);
 365             } else if (!ustricmp(k, L"html-body-tag")) {
 366                 ret.body_tag = adv(p->origkeyword);
 367             } else if (!ustricmp(k, L"html-body-start")) {
 368                 ret.body_start = adv(p->origkeyword);
 369             } else if (!ustricmp(k, L"html-body-end")) {
 370                 ret.body_end = adv(p->origkeyword);
 371             } else if (!ustricmp(k, L"html-address-start")) {
 372                 ret.addr_start = adv(p->origkeyword);
 373             } else if (!ustricmp(k, L"html-address-end")) {
 374                 ret.addr_end = adv(p->origkeyword);
 375             } else if (!ustricmp(k, L"html-navigation-attributes")) {
 376                 ret.nav_attr = adv(p->origkeyword);
 377             } else if (!ustricmp(k, L"html-author")) {
 378                 ret.author = uadv(k);
 379             } else if (!ustricmp(k, L"html-description")) {
 380                 ret.description = uadv(k);
 381             } else if (!ustricmp(k, L"html-suppress-address")) {
 382                 ret.address_section = !utob(uadv(k));
 383             } else if (!ustricmp(k, L"html-versionid")) {
 384                 ret.visible_version_id = utob(uadv(k));
 385             } else if (!ustricmp(k, L"html-quotes")) {
 386                 if (*uadv(k) && *uadv(uadv(k))) {
 387                     ret.lquote = uadv(k);
 388                     ret.rquote = uadv(ret.lquote);
 389                 }
 390             } else if (!ustricmp(k, L"html-leaf-contains-contents")) {
 391                 ret.leaf_contains_contents = utob(uadv(k));
 392             } else if (!ustricmp(k, L"html-leaf-smallest-contents")) {
 393                 ret.leaf_smallest_contents = utoi(uadv(k));
 394             }
 395         }
 396     }
 397
 398     /*
 399      * Now process fallbacks on quote characters.
 400      */
 401     while (*uadv(ret.rquote) && *uadv(uadv(ret.rquote)) &&
 402            (!cvt_ok(ret.restrict_charset, ret.lquote) ||
 403             !cvt_ok(ret.restrict_charset, ret.rquote))) {
 404         ret.lquote = uadv(ret.rquote);
 405         ret.rquote = uadv(ret.lquote);
 406     }
 407
 408     return ret;
 409 }
 410
 411 paragraph *html_config_filename(char *filename)
 412 {
 413     /*
 414      * If the user passes in a single filename as a parameter to
 415      * the `--html' command-line option, then we should assume it
 416      * to imply _two_ config directives:
 417      * \cfg{html-single-filename}{whatever} and
 418      * \cfg{html-leaf-level}{0}; the rationale being that the user
 419      * wants their output _in that file_.
 420      */
 421     paragraph *p, *q;
 422
 423     p = cmdline_cfg_simple("html-single-filename", filename, NULL);
 424     q = cmdline_cfg_simple("html-leaf-level", "0", NULL);
 425     p->next = q;
 426     return p;
 427 }
 428
 429 void html_backend(paragraph *sourceform, keywordlist *keywords,
 430                   indexdata *idx, void *unused) {
 431     paragraph *p;
 432     htmlconfig conf;
 433     htmlfilelist files = { NULL, NULL, NULL, NULL, NULL };
 434     htmlsectlist sects = { NULL, NULL }, nonsects = { NULL, NULL };
 435
 436     IGNORE(unused);
 437
 438     conf = html_configure(sourceform);
 439
 440     /*
 441      * We're going to make heavy use of paragraphs' private data
 442      * fields in the forthcoming code. Clear them first, so we can
 443      * reliably tell whether we have auxiliary data for a
 444      * particular paragraph.
 445      */
 446     for (p = sourceform; p; p = p->next)
 447         p->private_data = NULL;
 448
 449     files.frags = newtree234(html_fragment_compare);
 450
 451     /*
 452      * Start by figuring out into which file each piece of the
 453      * document should be put. We'll do this by inventing an
 454      * `htmlsect' structure and stashing it in the private_data
 455      * field of each section paragraph; we also need one additional
 456      * htmlsect for the document index, which won't show up in the
 457      * source form but needs to be consistently mentioned in
 458      * contents links.
 459      *
 460      * While we're here, we'll also invent the HTML fragment name
 461      * for each section.
 462      */
 463     {
 464         htmlsect *topsect, *sect;
 465         int d;
 466
 467         topsect = html_new_sect(&sects, p);
 468         topsect->type = TOP;
 469         topsect->title = NULL;
 470         topsect->text = sourceform;
 471         topsect->contents_depth = contents_depth(conf, 0);
 472         html_file_section(&conf, &files, topsect, -1);
 473         topsect->fragment = NULL;
 474
 475         for (p = sourceform; p; p = p->next)
 476             if (is_heading_type(p->type)) {
 477                 d = heading_depth(p);
 478
 479                 if (p->type == para_Title) {
 480                     topsect->title = p;
 481                     continue;
 482                 }
 483
 484                 sect = html_new_sect(&sects, p);
 485                 sect->text = p->next;
 486
 487                 sect->contents_depth = contents_depth(conf, d+1) - (d+1);
 488
 489                 if (p->parent) {
 490                     sect->parent = (htmlsect *)p->parent->private_data;
 491                     assert(sect->parent != NULL);
 492                 } else
 493                     sect->parent = topsect;
 494                 p->private_data = sect;
 495
 496                 html_file_section(&conf, &files, sect, d);
 497
 498                 sect->fragment = html_format(p, conf.template_fragment);
 499                 sect->fragment = html_sanitise_fragment(&files, sect->file,
 500                                                         sect->fragment);
 501             }
 502
 503         /* And the index. */
 504         sect = html_new_sect(&sects, NULL);
 505         sect->text = NULL;
 506         sect->type = INDEX;
 507         sect->parent = topsect;
 508         html_file_section(&conf, &files, sect, 0);   /* peer of chapters */
 509         sect->fragment = dupstr("Index");   /* FIXME: this _can't_ be right */
 510         sect->fragment = html_sanitise_fragment(&files, sect->file,
 511                                                 sect->fragment);
 512         files.index = sect->file;
 513     }
 514
 515     /*
 516      * Go through the keyword list and sort out fragment IDs for
 517      * all the potentially referenced paragraphs which _aren't_
 518      * headings.
 519      */
 520     {
 521         int i;
 522         keyword *kw;
 523         htmlsect *sect;
 524
 525         for (i = 0; (kw = index234(keywords->keys, i)) != NULL; i++) {
 526             paragraph *q, *p = kw->para;
 527
 528             if (!is_heading_type(p->type)) {
 529                 htmlsect *parent;
 530
 531                 /*
 532                  * Find the paragraph's parent htmlsect, to
 533                  * determine which file it will end up in.
 534                  */
 535                 q = p->parent;
 536                 if (!q) {
 537                     /*
 538                      * Preamble paragraphs have no parent. So if we
 539                      * have a non-heading with no parent, it must
 540                      * be preamble, and therefore its parent
 541                      * htmlsect must be the preamble one.
 542                      */
 543                     assert(sects.head &&
 544                            sects.head->type == TOP);
 545                     parent = sects.head;
 546                 } else
 547                     parent = (htmlsect *)q->private_data;
 548
 549                 /*
 550                  * Now we can construct an htmlsect for this
 551                  * paragraph itself, taking care to put it in the
 552                  * list of non-sections rather than the list of
 553                  * sections (so that traverses of the `sects' list
 554                  * won't attempt to add it to the contents or
 555                  * anything weird like that).
 556                  */
 557                 sect = html_new_sect(&nonsects, p);
 558                 sect->file = parent->file;
 559                 sect->parent = parent;
 560                 p->private_data = sect;
 561
 562                 /*
 563                  * FIXME: We need a much better means of naming
 564                  * these, possibly involving an additional
 565                  * configuration template. For the moment I'll just
 566                  * invent something completely stupid.
 567                  */
 568                 sect->fragment = snewn(40, char);
 569                 sprintf(sect->fragment, "frag%p", sect);
 570                 sect->fragment = html_sanitise_fragment(&files, sect->file,
 571                                                         sect->fragment);
 572             }
 573         }
 574     }
 575
 576     /*
 577      * Now sort out the index. This involves:
 578      *
 579      *  - For each index term, we set up an htmlindex structure to
 580      *    store all the references to that term.
 581      *
 582      *  - Then we make a pass over the actual document, finding
 583      *    every word_IndexRef; for each one, we actually figure out
 584      *    the HTML filename/fragment pair we will use to reference
 585      *    it, store that information in the private data field of
 586      *    the word_IndexRef itself (so we can recreate it when the
 587      *    time comes to output our HTML), and add a reference to it
 588      *    to the index term in question.
 589      */
 590     {
 591         int i;
 592         indexentry *entry;
 593         htmlsect *lastsect;
 594         word *w;
 595
 596         /*
 597          * Set up the htmlindex structures.
 598          */
 599
 600         for (i = 0; (entry = index234(idx->entries, i)) != NULL; i++) {
 601             htmlindex *hi = snew(htmlindex);
 602
 603             hi->nrefs = hi->refsize = 0;
 604             hi->refs = NULL;
 605
 606             entry->backend_data = hi;
 607         }
 608
 609         /*
 610          * Run over the document inventing fragments. Each fragment
 611          * is of the form `i' followed by an integer.
 612          */
 613         lastsect = NULL;
 614         for (p = sourceform; p; p = p->next) {
 615             if (is_heading_type(p->type))
 616                 lastsect = (htmlsect *)p->private_data;
 617
 618             for (w = p->words; w; w = w->next)
 619                 if (w->type == word_IndexRef) {
 620                     htmlindexref *hr = snew(htmlindexref);
 621                     indextag *tag;
 622                     int i;
 623
 624                     hr->section = lastsect;
 625                     {
 626                         char buf[40];
 627                         sprintf(buf, "i%d",
 628                                 lastsect->file->last_fragment_number++);
 629                         hr->fragment = dupstr(buf);
 630                         hr->fragment =
 631                             html_sanitise_fragment(&files, hr->section->file,
 632                                                    hr->fragment);
 633                     }
 634                     w->private_data = hr;
 635
 636                     tag = index_findtag(idx, w->text);
 637                     if (!tag)
 638                         break;
 639
 640                     for (i = 0; i < tag->nrefs; i++) {
 641                         indexentry *entry = tag->refs[i];
 642                         htmlindex *hi = (htmlindex *)entry->backend_data;
 643
 644                         if (hi->nrefs >= hi->refsize) {
 645                             hi->refsize += 32;
 646                             hi->refs = sresize(hi->refs, hi->refsize, word *);
 647                         }
 648
 649                         hi->refs[hi->nrefs++] = w;
 650                     }
 651                 }
 652         }
 653     }
 654
 655     /*
 656      * Now we're ready to write out the actual HTML files.
 657      *
 658      * For each file:
 659      *
 660      *  - we open that file and write its header
 661      *  - we run down the list of sections
 662      *  - for each section directly contained within that file, we
 663      *    output the section text
 664      *  - for each section which is not in the file but which has a
 665      *    parent that is, we output a contents entry for the
 666      *    section if appropriate
 667      *  - finally, we output the file trailer and close the file.
 668      */
 669     {
 670         htmlfile *f, *prevf;
 671         htmlsect *s;
 672         paragraph *p;
 673
 674         prevf = NULL;
 675
 676         for (f = files.head; f; f = f->next) {
 677             htmloutput ho;
 678             int displaying;
 679             enum LISTTYPE { NOLIST, UL, OL, DL };
 680             enum ITEMTYPE { NOITEM, LI, DT, DD };
 681             struct stackelement {
 682                 struct stackelement *next;
 683                 enum LISTTYPE listtype;
 684                 enum ITEMTYPE itemtype;
 685             } *stackhead;
 686
 687 #define listname(lt) ( (lt)==UL ? "ul" : (lt)==OL ? "ol" : "dl" )
 688 #define itemname(lt) ( (lt)==LI ? "li" : (lt)==DT ? "dt" : "dd" )
 689
 690             ho.fp = fopen(f->filename, "w");
 691             ho.charset = conf.output_charset;
 692             ho.cstate = charset_init_state;
 693             ho.ver = conf.htmlver;
 694             ho.state = HO_NEUTRAL;
 695             ho.contents_level = 0;
 696
 697             /* <!DOCTYPE>. */
 698             switch (conf.htmlver) {
 699               case HTML_3_2:
 700                 fprintf(ho.fp, "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD "
 701                         "HTML 3.2 Final//EN\">\n");
 702                 break;
 703               case HTML_4:
 704                 fprintf(ho.fp, "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML"
 705                         " 4.01//EN\"\n\"http://www.w3.org/TR/html4/"
 706                         "strict.dtd\">\n");
 707                 break;
 708               case XHTML_1_0_TRANSITIONAL:
 709                 /* FIXME: <?xml?> to specify character encoding.
 710                  * This breaks HTML backwards compat, so perhaps avoid, or
 711                  * perhaps only emit when not using the default UTF-8? */
 712                 fprintf(ho.fp, "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML"
 713                         " 1.0 Transitional//EN\"\n\"http://www.w3.org/TR/"
 714                         "xhtml1/DTD/xhtml1-transitional.dtd\">\n");
 715                 break;
 716               case XHTML_1_0_STRICT:
 717                 /* FIXME: <?xml?> to specify character encoding. */
 718                 fprintf(ho.fp, "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML"
 719                         " 1.0 Strict//EN\"\n\"http://www.w3.org/TR/xhtml1/"
 720                         "DTD/xhtml1-strict.dtd\">\n");
 721                 break;
 722             }
 723
 724             element_open(&ho, "html");
 725             if (is_xhtml(conf.htmlver)) {
 726                 element_attr(&ho, "xmlns", "http://www.w3.org/1999/xhtml");
 727             }
 728             html_nl(&ho);
 729
 730             element_open(&ho, "head");
 731             html_nl(&ho);
 732
 733             element_empty(&ho, "meta");
 734             element_attr(&ho, "http-equiv", "content-type");
 735             {
 736                 char buf[200];
 737                 sprintf(buf, "text/html; charset=%.150s",
 738                         charset_to_mimeenc(conf.output_charset));
 739                 element_attr(&ho, "content", buf);
 740             }
 741             html_nl(&ho);
 742
 743             if (conf.author) {
 744                 element_empty(&ho, "meta");
 745                 element_attr(&ho, "name", "author");
 746                 element_attr_w(&ho, "content", conf.author);
 747                 html_nl(&ho);
 748             }
 749
 750             if (conf.description) {
 751                 element_empty(&ho, "meta");
 752                 element_attr(&ho, "name", "description");
 753                 element_attr_w(&ho, "content", conf.description);
 754                 html_nl(&ho);
 755             }
 756
 757             element_open(&ho, "title");
 758             if (f->first && f->first->title) {
 759                 html_words(&ho, f->first->title->words, NOTHING,
 760                            f, keywords, &conf);
 761
 762                 assert(f->last);
 763                 if (f->last != f->first && f->last->title) {
 764                     html_text(&ho, L" - ");   /* FIXME: configurable? */
 765                     html_words(&ho, f->last->title->words, NOTHING,
 766                                f, keywords, &conf);
 767                 }
 768             }
 769             element_close(&ho, "title");
 770             html_nl(&ho);
 771
 772             if (conf.head_end)
 773                 html_raw(&ho, conf.head_end);
 774
 775             element_close(&ho, "head");
 776             html_nl(&ho);
 777
 778             /* FIXME: need to be able to specify replacement for this */
 779             if (conf.body_tag)
 780                 html_raw(&ho, conf.body_tag);
 781             else
 782                 element_open(&ho, "body");
 783             html_nl(&ho);
 784
 785             if (conf.body_start)
 786                 html_raw(&ho, conf.body_start);
 787
 788             /*
 789              * Write out a nav bar. Special case: we don't do this
 790              * if there is only one file.
 791              */
 792             if (files.head != files.tail) {
 793                 element_open(&ho, "p");
 794                 if (conf.nav_attr)
 795                     html_raw_as_attr(&ho, conf.nav_attr);
 796
 797                 if (prevf) {
 798                     element_open(&ho, "a");
 799                     element_attr(&ho, "href", prevf->filename);
 800                 }
 801                 html_text(&ho, L"Previous");/* FIXME: conf? */
 802                 if (prevf)
 803                     element_close(&ho, "a");
 804
 805                 html_text(&ho, L" | ");     /* FIXME: conf? */
 806
 807                 if (f != files.head) {
 808                     element_open(&ho, "a");
 809                     element_attr(&ho, "href", files.head->filename);
 810                 }
 811                 html_text(&ho, L"Contents");/* FIXME: conf? */
 812                 if (f != files.head)
 813                     element_close(&ho, "a");
 814
 815                 html_text(&ho, L" | ");     /* FIXME: conf? */
 816
 817                 if (f != files.index) {
 818                     element_open(&ho, "a");
 819                     element_attr(&ho, "href", files.index->filename);
 820                 }
 821                 html_text(&ho, L"Index");/* FIXME: conf? */
 822                 if (f != files.index)
 823                     element_close(&ho, "a");
 824
 825                 html_text(&ho, L" | ");     /* FIXME: conf? */
 826
 827                 if (f->next) {
 828                     element_open(&ho, "a");
 829                     element_attr(&ho, "href", f->next->filename);
 830                 }
 831                 html_text(&ho, L"Next");    /* FIXME: conf? */
 832                 if (f->next)
 833                     element_close(&ho, "a");
 834
 835                 element_close(&ho, "p");
 836                 html_nl(&ho);
 837             }
 838             prevf = f;
 839
 840             /*
 841              * Write out a prefix TOC for the file.
 842              *
 843              * We start by going through the section list and
 844              * collecting the sections which need to be added to
 845              * the contents. On the way, we also test to see if
 846              * this file is a leaf file (defined as one which
 847              * contains all descendants of any section it
 848              * contains), because this will play a part in our
 849              * decision on whether or not to _output_ the TOC.
 850              *
 851              * Special case: we absolutely do not do this if we're
 852              * in single-file mode.
 853              */
 854             if (files.head != files.tail) {
 855                 int ntoc = 0, tocsize = 0;
 856                 htmlsect **toc = NULL;
 857                 int leaf = TRUE;
 858
 859                 for (s = sects.head; s; s = s->next) {
 860                     htmlsect *a, *ac;
 861                     int depth, adepth;
 862
 863                     /*
 864                      * Search up from this section until we find
 865                      * the highest-level one which belongs in this
 866                      * file.
 867                      */
 868                     depth = adepth = 0;
 869                     a = NULL;
 870                     for (ac = s; ac; ac = ac->parent) {
 871                         if (ac->file == f) {
 872                             a = ac;
 873                             adepth = depth;
 874                         }
 875                         depth++;
 876                     }
 877
 878                     if (s->file != f && a != NULL)
 879                         leaf = FALSE;
 880
 881                     if (a) {
 882                         if (adepth <= a->contents_depth) {
 883                             if (ntoc >= tocsize) {
 884                                 tocsize += 64;
 885                                 toc = sresize(toc, tocsize, htmlsect *);
 886                             }
 887                             toc[ntoc++] = s;
 888                         }
 889                     }
 890                 }
 891
 892                 if (leaf && conf.leaf_contains_contents &&
 893                     ntoc >= conf.leaf_smallest_contents) {
 894                     int i;
 895
 896                     for (i = 0; i < ntoc; i++) {
 897                         htmlsect *s = toc[i];
 898                         int hlevel = (s->type == TOP ? -1 :
 899                                       s->type == INDEX ? 0 :
 900                                       heading_depth(s->title))
 901                             - f->min_heading_depth + 1;
 902
 903                         assert(hlevel >= 1);
 904                         html_contents_entry(&ho, hlevel, s,
 905                                             f, keywords, &conf);
 906                     }
 907                     html_contents_entry(&ho, 0, NULL, f, keywords, &conf);
 908                 }
 909             }
 910
 911             /*
 912              * Now go through the document and output some real
 913              * text.
 914              */
 915             displaying = FALSE;
 916             for (s = sects.head; s; s = s->next) {
 917                 if (s->file == f) {
 918                     /*
 919                      * This section belongs in this file.
 920                      * Display it.
 921                      */
 922                     displaying = TRUE;
 923                 } else {
 924                     htmlsect *a, *ac;
 925                     int depth, adepth;
 926
 927                     displaying = FALSE;
 928
 929                     /*
 930                      * Search up from this section until we find
 931                      * the highest-level one which belongs in this
 932                      * file.
 933                      */
 934                     depth = adepth = 0;
 935                     a = NULL;
 936                     for (ac = s; ac; ac = ac->parent) {
 937                         if (ac->file == f) {
 938                             a = ac;
 939                             adepth = depth;
 940                         }
 941                         depth++;
 942                     }
 943
 944                     if (a != NULL) {
 945                         /*
 946                          * This section does not belong in this
 947                          * file, but an ancestor of it does. Write
 948                          * out a contents table entry, if the depth
 949                          * doesn't exceed the maximum contents
 950                          * depth for the ancestor section.
 951                          */
 952                         if (adepth <= a->contents_depth) {
 953                             html_contents_entry(&ho, adepth, s,
 954                                                 f, keywords, &conf);
 955                         }
 956                     }
 957                 }
 958
 959                 if (displaying) {
 960                     int hlevel;
 961                     char htag[3];
 962
 963                     html_contents_entry(&ho, 0, NULL, f, keywords, &conf);
 964
 965                     /*
 966                      * Display the section heading.
 967                      */
 968
 969                     hlevel = (s->type == TOP ? -1 :
 970                               s->type == INDEX ? 0 :
 971                               heading_depth(s->title))
 972                         - f->min_heading_depth + 1;
 973                     assert(hlevel >= 1);
 974                     /* HTML headings only go up to <h6> */
 975                     if (hlevel > 6)
 976                         hlevel = 6;
 977                     htag[0] = 'h';
 978                     htag[1] = '0' + hlevel;
 979                     htag[2] = '\0';
 980                     element_open(&ho, htag);
 981
 982                     /*
 983                      * Provide anchor for cross-links to target.
 984                      *
 985                      * FIXME: AIcurrentlyUI, this needs to be done
 986                      * differently in XHTML because <a name> is
 987                      * deprecated or obsolete.
 988                      *
 989                      * (Also we'll have to do this separately in
 990                      * other paragraph types - NumberedList and
 991                      * BiblioCited.)
 992                      */
 993                     element_open(&ho, "a");
 994                     element_attr(&ho, "name", s->fragment);
 995                     element_close(&ho, "a");
 996
 997                     html_section_title(&ho, s, f, keywords, &conf, TRUE);
 998
 999                     element_close(&ho, htag);
1000
1001                     /*
1002                      * Now display the section text.
1003                      */
1004                     if (s->text) {
1005                         stackhead = snew(struct stackelement);
1006                         stackhead->next = NULL;
1007                         stackhead->listtype = NOLIST;
1008                         stackhead->itemtype = NOITEM;
1009
1010                         for (p = s->text;; p = p->next) {
1011                             enum LISTTYPE listtype;
1012                             struct stackelement *se;
1013
1014                             /*
1015                              * Preliminary switch to figure out what
1016                              * sort of list we expect to be inside at
1017                              * this stage.
1018                              *
1019                              * Since p may still be NULL at this point,
1020                              * I invent a harmless paragraph type for
1021                              * it if it is.
1022                              */
1023                             switch (p ? p->type : para_Normal) {
1024                               case para_Rule:
1025                               case para_Normal:
1026                               case para_Copyright:
1027                               case para_BiblioCited:
1028                               case para_Code:
1029                               case para_QuotePush:
1030                               case para_QuotePop:
1031                               case para_Chapter:
1032                               case para_Appendix:
1033                               case para_UnnumberedChapter:
1034                               case para_Heading:
1035                               case para_Subsect:
1036                               case para_LcontPop:
1037                                 listtype = NOLIST;
1038                                 break;
1039
1040                               case para_Bullet:
1041                                 listtype = UL;
1042                                 break;
1043
1044                               case para_NumberedList:
1045                                 listtype = OL;
1046                                 break;
1047
1048                               case para_DescribedThing:
1049                               case para_Description:
1050                                 listtype = DL;
1051                                 break;
1052
1053                               case para_LcontPush:
1054                                 se = snew(struct stackelement);
1055                                 se->next = stackhead;
1056                                 se->listtype = NOLIST;
1057                                 se->itemtype = NOITEM;
1058                                 stackhead = se;
1059                                 continue;
1060
1061                               default:     /* some totally non-printing para */
1062                                 continue;
1063                             }
1064
1065                             html_nl(&ho);
1066
1067                             /*
1068                              * Terminate the most recent list item, if
1069                              * any. (We left this until after
1070                              * processing LcontPush, since in that case
1071                              * the list item won't want to be
1072                              * terminated until after the corresponding
1073                              * LcontPop.)
1074                              */
1075                             if (stackhead->itemtype != NOITEM) {
1076                                 element_close(&ho, itemname(stackhead->itemtype));
1077                                 html_nl(&ho);
1078                             }
1079                             stackhead->itemtype = NOITEM;
1080
1081                             /*
1082                              * Terminate the current list, if it's not
1083                              * the one we want to be in.
1084                              */
1085                             if (listtype != stackhead->listtype &&
1086                                 stackhead->listtype != NOLIST) {
1087                                 element_close(&ho, listname(stackhead->listtype));
1088                                 html_nl(&ho);
1089                             }
1090
1091                             /*
1092                              * Leave the loop if our time has come.
1093                              */
1094                             if (!p || (is_heading_type(p->type) &&
1095                                        p->type != para_Title))
1096                                 break;     /* end of section text */
1097
1098                             /*
1099                              * Start a fresh list if necessary.
1100                              */
1101                             if (listtype != stackhead->listtype &&
1102                                 listtype != NOLIST)
1103                                 element_open(&ho, listname(listtype));
1104
1105                             stackhead->listtype = listtype;
1106
1107                             switch (p->type) {
1108                               case para_Rule:
1109                                 element_empty(&ho, "hr");
1110                                 break;
1111                               case para_Code:
1112                                 html_codepara(&ho, p->words);
1113                                 break;
1114                               case para_Normal:
1115                               case para_Copyright:
1116                                 element_open(&ho, "p");
1117                                 html_nl(&ho);
1118                                 html_words(&ho, p->words, ALL,
1119                                            f, keywords, &conf);
1120                                 html_nl(&ho);
1121                                 element_close(&ho, "p");
1122                                 break;
1123                               case para_BiblioCited:
1124                                 element_open(&ho, "p");
1125                                 if (p->private_data) {
1126                                     htmlsect *s = (htmlsect *)p->private_data;
1127                                     element_open(&ho, "a");
1128                                     element_attr(&ho, "name", s->fragment);
1129                                     element_close(&ho, "a");
1130                                 }
1131                                 html_nl(&ho);
1132                                 html_words(&ho, p->kwtext, ALL,
1133                                            f, keywords, &conf);
1134                                 html_text(&ho, L" ");
1135                                 html_words(&ho, p->words, ALL,
1136                                            f, keywords, &conf);
1137                                 html_nl(&ho);
1138                                 element_close(&ho, "p");
1139                                 break;
1140                               case para_Bullet:
1141                               case para_NumberedList:
1142                                 element_open(&ho, "li");
1143                                 if (p->private_data) {
1144                                     htmlsect *s = (htmlsect *)p->private_data;
1145                                     element_open(&ho, "a");
1146                                     element_attr(&ho, "name", s->fragment);
1147                                     element_close(&ho, "a");
1148                                 }
1149                                 html_nl(&ho);
1150                                 stackhead->itemtype = LI;
1151                                 html_words(&ho, p->words, ALL,
1152                                            f, keywords, &conf);
1153                                 break;
1154                               case para_DescribedThing:
1155                                 element_open(&ho, "dt");
1156                                 html_nl(&ho);
1157                                 stackhead->itemtype = DT;
1158                                 html_words(&ho, p->words, ALL,
1159                                            f, keywords, &conf);
1160                                 break;
1161                               case para_Description:
1162                                 element_open(&ho, "dd");
1163                                 html_nl(&ho);
1164                                 stackhead->itemtype = DD;
1165                                 html_words(&ho, p->words, ALL,
1166                                            f, keywords, &conf);
1167                                 break;
1168
1169                               case para_QuotePush:
1170                                 element_open(&ho, "blockquote");
1171                                 break;
1172                               case para_QuotePop:
1173                                 element_close(&ho, "blockquote");
1174                                 break;
1175
1176                               case para_LcontPop:
1177                                 se = stackhead;
1178                                 stackhead = stackhead->next;
1179                                 assert(stackhead);
1180                                 sfree(se);
1181                                 break;
1182                             }
1183                         }
1184
1185                         assert(stackhead && !stackhead->next);
1186                         sfree(stackhead);
1187                     }
1188
1189                     if (s->type == INDEX) {
1190                         indexentry *entry;
1191                         int i;
1192
1193                         /*
1194                          * This section is the index. I'll just
1195                          * render it as a single paragraph, with a
1196                          * colon between the index term and the
1197                          * references, and <br> in between each
1198                          * entry.
1199                          */
1200                         element_open(&ho, "p");
1201
1202                         for (i = 0; (entry =
1203                                      index234(idx->entries, i)) != NULL; i++) {
1204                             htmlindex *hi = (htmlindex *)entry->backend_data;
1205                             int j;
1206
1207                             if (i > 0)
1208                                 element_empty(&ho, "br");
1209                             html_nl(&ho);
1210
1211                             html_words(&ho, entry->text, MARKUP|LINKS,
1212                                        f, keywords, &conf);
1213
1214                             html_text(&ho, L": ");/* FIXME: configurable */
1215
1216                             for (j = 0; j < hi->nrefs; j++) {
1217                                 htmlindexref *hr =
1218                                     (htmlindexref *)hi->refs[j]->private_data;
1219                                 paragraph *p = hr->section->title;
1220
1221                                 if (j > 0)
1222                                     html_text(&ho, L", "); /* FIXME: conf */
1223
1224                                 html_href(&ho, f, hr->section->file,
1225                                           hr->fragment);
1226                                 if (p && p->kwtext)
1227                                     html_words(&ho, p->kwtext, MARKUP|LINKS,
1228                                                f, keywords, &conf);
1229                                 else if (p && p->words)
1230                                     html_words(&ho, p->words, MARKUP|LINKS,
1231                                                f, keywords, &conf);
1232                                 else
1233                                     html_text(&ho, L"FIXME");
1234                                 element_close(&ho, "a");
1235                             }
1236                         }
1237                         element_close(&ho, "p");
1238                     }
1239                 }
1240             }
1241
1242             html_contents_entry(&ho, 0, NULL, f, keywords, &conf);
1243             html_nl(&ho);
1244
1245             {
1246                 /*
1247                  * Footer.
1248                  */
1249                 int done_version_ids = FALSE;
1250
1251                 element_empty(&ho, "hr");
1252
1253                 if (conf.body_end)
1254                     html_raw(&ho, conf.body_end);
1255
1256                 if (conf.address_section) {
1257                     element_open(&ho, "address");
1258                     if (conf.addr_start) {
1259                         html_raw(&ho, conf.addr_start);
1260                         html_nl(&ho);
1261                     }
1262                     if (conf.visible_version_id) {
1263                         int started = FALSE;
1264                         for (p = sourceform; p; p = p->next)
1265                             if (p->type == para_VersionID) {
1266                                 if (!started)
1267                                     element_open(&ho, "p");
1268                                 else
1269                                     element_empty(&ho, "br");
1270                                 html_nl(&ho);
1271                                 html_text(&ho, L"[");   /* FIXME: conf? */
1272                                 html_words(&ho, p->words, NOTHING,
1273                                            f, keywords, &conf);
1274                                 html_text(&ho, L"]");   /* FIXME: conf? */
1275                                 started = TRUE;
1276                             }
1277                         if (started)
1278                             element_close(&ho, "p");
1279                         done_version_ids = TRUE;
1280                     }
1281                     if (conf.addr_end)
1282                         html_raw(&ho, conf.addr_end);
1283                     element_close(&ho, "address");
1284                 }
1285
1286                 if (!done_version_ids) {
1287                     /*
1288                      * If the user didn't want the version IDs
1289                      * visible, I think we still have a duty to put
1290                      * them in an HTML comment.
1291                      */
1292                     int started = FALSE;
1293                     for (p = sourceform; p; p = p->next)
1294                         if (p->type == para_VersionID) {
1295                             if (!started) {
1296                                 html_raw(&ho, "<!-- version IDs:\n");
1297                                 started = TRUE;
1298                             }
1299                             html_words(&ho, p->words, NOTHING,
1300                                        f, keywords, &conf);
1301                             html_nl(&ho);
1302                         }
1303                     if (started)
1304                         html_raw(&ho, "-->\n");
1305                 }
1306             }
1307
1308             element_close(&ho, "body");
1309             html_nl(&ho);
1310             element_close(&ho, "html");
1311             html_nl(&ho);
1312             cleanup(&ho);
1313         }
1314     }
1315
1316     /*
1317      * FIXME: Free all the working data.
1318      */
1319 }
1320
1321 static void html_file_section(htmlconfig *cfg, htmlfilelist *files,
1322                               htmlsect *sect, int depth)
1323 {
1324     htmlfile *file;
1325     int ldepth;
1326
1327     /*
1328      * `depth' is derived from the heading_depth() macro at the top
1329      * of this file, which counts title as -1, chapter as 0,
1330      * heading as 1 and subsection as 2. However, the semantics of
1331      * cfg->leaf_level are defined to count chapter as 1, heading
1332      * as 2 etc. So first I increment depth :-(
1333      */
1334     ldepth = depth + 1;
1335
1336     if (cfg->leaf_level == 0) {
1337         /*
1338          * leaf_level==0 is a special case, in which everything is
1339          * put into a single file.
1340          */
1341         if (!files->single)
1342             files->single = html_new_file(files, cfg->single_filename);
1343
1344         file = files->single;
1345     } else {
1346         /*
1347          * If the depth of this section is at or above leaf_level,
1348          * we invent a fresh file and put this section at its head.
1349          * Otherwise, we put it in the same file as its parent
1350          * section.
1351          */
1352         if (ldepth > cfg->leaf_level) {
1353             /*
1354              * We know that sect->parent cannot be NULL. The only
1355              * circumstance in which it can be is if sect is at
1356              * chapter or appendix level, i.e. ldepth==1; and if
1357              * that's the case, then we cannot have entered this
1358              * branch unless cfg->leaf_level==0, in which case we
1359              * would be in the single-file case above and not here
1360              * at all.
1361              */
1362             assert(sect->parent);
1363
1364             file = sect->parent->file;
1365         } else {
1366             if (sect->type == TOP) {
1367                 file = html_new_file(files, cfg->contents_filename);
1368             } else if (sect->type == INDEX) {
1369                 file = html_new_file(files, cfg->index_filename);
1370             } else {
1371                 char *title;
1372
1373                 assert(ldepth > 0 && sect->title);
1374                 title = html_format(sect->title, cfg->template_filename);
1375                 file = html_new_file(files, title);
1376                 sfree(title);
1377             }
1378         }
1379     }
1380
1381     sect->file = file;
1382
1383     if (file->min_heading_depth > depth) {
1384         /*
1385          * This heading is at a higher level than any heading we
1386          * have so far placed in this file; so we set the `first'
1387          * pointer.
1388          */
1389         file->min_heading_depth = depth;
1390         file->first = sect;
1391     }
1392
1393     if (file->min_heading_depth == depth)
1394         file->last = sect;
1395 }
1396
1397 static htmlfile *html_new_file(htmlfilelist *list, char *filename)
1398 {
1399     htmlfile *ret = snew(htmlfile);
1400
1401     ret->next = NULL;
1402     if (list->tail)
1403         list->tail->next = ret;
1404     else
1405         list->head = ret;
1406     list->tail = ret;
1407
1408     ret->filename = dupstr(filename);
1409     ret->last_fragment_number = 0;
1410     ret->min_heading_depth = INT_MAX;
1411     ret->first = ret->last = NULL;
1412
1413     return ret;
1414 }
1415
1416 static htmlsect *html_new_sect(htmlsectlist *list, paragraph *title)
1417 {
1418     htmlsect *ret = snew(htmlsect);
1419
1420     ret->next = NULL;
1421     if (list->tail)
1422         list->tail->next = ret;
1423     else
1424         list->head = ret;
1425     list->tail = ret;
1426
1427     ret->title = title;
1428     ret->file = NULL;
1429     ret->parent = NULL;
1430     ret->type = NORMAL;
1431
1432     return ret;
1433 }
1434
1435 static void html_words(htmloutput *ho, word *words, int flags,
1436                        htmlfile *file, keywordlist *keywords, htmlconfig *cfg)
1437 {
1438     word *w;
1439     char *c;
1440     int style, type;
1441
1442     for (w = words; w; w = w->next) switch (w->type) {
1443       case word_HyperLink:
1444         if (flags & LINKS) {
1445             element_open(ho, "a");
1446             c = utoa_dup(w->text, CS_ASCII);
1447             element_attr(ho, "href", c);
1448             sfree(c);
1449         }
1450         break;
1451       case word_UpperXref:
1452       case word_LowerXref:
1453         if (flags & LINKS) {
1454             keyword *kwl = kw_lookup(keywords, w->text);
1455             paragraph *p = kwl->para;
1456             htmlsect *s = (htmlsect *)p->private_data;
1457
1458             assert(s);
1459
1460             html_href(ho, file, s->file, s->fragment);
1461         }
1462         break;
1463       case word_HyperEnd:
1464       case word_XrefEnd:
1465         if (flags & LINKS)
1466             element_close(ho, "a");
1467         break;
1468       case word_IndexRef:
1469         if (flags & INDEXENTS) {
1470             htmlindexref *hr = (htmlindexref *)w->private_data;
1471             element_open(ho, "a");
1472             element_attr(ho, "name", hr->fragment);
1473             element_close(ho, "a");
1474         }
1475         break;
1476       case word_Normal:
1477       case word_Emph:
1478       case word_Code:
1479       case word_WeakCode:
1480       case word_WhiteSpace:
1481       case word_EmphSpace:
1482       case word_CodeSpace:
1483       case word_WkCodeSpace:
1484       case word_Quote:
1485       case word_EmphQuote:
1486       case word_CodeQuote:
1487       case word_WkCodeQuote:
1488         style = towordstyle(w->type);
1489         type = removeattr(w->type);
1490         if (style == word_Emph &&
1491             (attraux(w->aux) == attr_First ||
1492              attraux(w->aux) == attr_Only) &&
1493             (flags & MARKUP))
1494             element_open(ho, "em");
1495         else if ((style == word_Code || style == word_WeakCode) &&
1496                  (attraux(w->aux) == attr_First ||
1497                   attraux(w->aux) == attr_Only) &&
1498                  (flags & MARKUP))
1499             element_open(ho, "code");
1500
1501         if (type == word_WhiteSpace)
1502             html_text(ho, L" ");
1503         else if (type == word_Quote) {
1504             if (quoteaux(w->aux) == quote_Open)
1505                 html_text(ho, cfg->lquote);
1506             else
1507                 html_text(ho, cfg->rquote);
1508         } else {
1509             if (cvt_ok(ho->charset, w->text) || !w->alt)
1510                 html_text(ho, w->text);
1511             else
1512                 html_words(ho, w->alt, flags, file, keywords, cfg);
1513         }
1514
1515         if (style == word_Emph &&
1516             (attraux(w->aux) == attr_Last ||
1517              attraux(w->aux) == attr_Only) &&
1518             (flags & MARKUP))
1519             element_close(ho, "em");
1520         else if ((style == word_Code || style == word_WeakCode) &&
1521                  (attraux(w->aux) == attr_Last ||
1522                   attraux(w->aux) == attr_Only) &&
1523                  (flags & MARKUP))
1524             element_close(ho, "code");
1525
1526         break;
1527     }
1528 }
1529
1530 static void html_codepara(htmloutput *ho, word *words)
1531 {
1532     element_open(ho, "pre");
1533     element_open(ho, "code");
1534     for (; words; words = words->next) if (words->type == word_WeakCode) {
1535         char *open_tag;
1536         wchar_t *t, *e;
1537
1538         t = words->text;
1539         if (words->next && words->next->type == word_Emph) {
1540             e = words->next->text;
1541             words = words->next;
1542         } else
1543             e = NULL;
1544
1545         while (e && *e && *t) {
1546             int n;
1547             int ec = *e;
1548
1549             for (n = 0; t[n] && e[n] && e[n] == ec; n++);
1550
1551             open_tag = NULL;
1552             if (ec == 'i')
1553                 open_tag = "em";
1554             else if (ec == 'b')
1555                 open_tag = "b";
1556             if (open_tag)
1557                 element_open(ho, open_tag);
1558
1559             html_text_limit(ho, t, n);
1560
1561             if (open_tag)
1562                 element_close(ho, open_tag);
1563
1564             t += n;
1565             e += n;
1566         }
1567         html_text(ho, t);
1568         html_nl(ho);
1569     }
1570     element_close(ho, "code");
1571     element_close(ho, "pre");
1572 }
1573
1574 static void html_charset_cleanup(htmloutput *ho)
1575 {
1576     char outbuf[256];
1577     int bytes;
1578
1579     bytes = charset_from_unicode(NULL, NULL, outbuf, lenof(outbuf),
1580                                  ho->charset, &ho->cstate, NULL);
1581     if (bytes > 0)
1582         fwrite(outbuf, 1, bytes, ho->fp);
1583 }
1584
1585 static void return_to_neutral(htmloutput *ho)
1586 {
1587     if (ho->state == HO_IN_TEXT) {
1588         html_charset_cleanup(ho);
1589     } else if (ho->state == HO_IN_EMPTY_TAG && is_xhtml(ho->ver)) {
1590         fprintf(ho->fp, " />");
1591     } else if (ho->state == HO_IN_EMPTY_TAG || ho->state == HO_IN_TAG) {
1592         fprintf(ho->fp, ">");
1593     }
1594
1595     ho->state = HO_NEUTRAL;
1596 }
1597
1598 static void element_open(htmloutput *ho, char const *name)
1599 {
1600     return_to_neutral(ho);
1601     fprintf(ho->fp, "<%s", name);
1602     ho->state = HO_IN_TAG;
1603 }
1604
1605 static void element_close(htmloutput *ho, char const *name)
1606 {
1607     return_to_neutral(ho);
1608     fprintf(ho->fp, "</%s>", name);
1609     ho->state = HO_NEUTRAL;
1610 }
1611
1612 static void element_empty(htmloutput *ho, char const *name)
1613 {
1614     return_to_neutral(ho);
1615     fprintf(ho->fp, "<%s", name);
1616     ho->state = HO_IN_EMPTY_TAG;
1617 }
1618
1619 static void html_nl(htmloutput *ho)
1620 {
1621     return_to_neutral(ho);
1622     fputc('\n', ho->fp);
1623 }
1624
1625 static void html_raw(htmloutput *ho, char *text)
1626 {
1627     return_to_neutral(ho);
1628     fputs(text, ho->fp);
1629 }
1630
1631 static void html_raw_as_attr(htmloutput *ho, char *text)
1632 {
1633     assert(ho->state == HO_IN_TAG || ho->state == HO_IN_EMPTY_TAG);
1634     fputc(' ', ho->fp);
1635     fputs(text, ho->fp);
1636 }
1637
1638 static void element_attr(htmloutput *ho, char const *name, char const *value)
1639 {
1640     html_charset_cleanup(ho);
1641     assert(ho->state == HO_IN_TAG || ho->state == HO_IN_EMPTY_TAG);
1642     fprintf(ho->fp, " %s=\"%s\"", name, value);
1643 }
1644
1645 static void element_attr_w(htmloutput *ho, char const *name,
1646                            wchar_t const *value)
1647 {
1648     html_charset_cleanup(ho);
1649     fprintf(ho->fp, " %s=\"", name);
1650     html_text_limit_internal(ho, value, 0, TRUE);
1651     html_charset_cleanup(ho);
1652     fputc('"', ho->fp);
1653 }
1654
1655 static void html_text(htmloutput *ho, wchar_t const *text)
1656 {
1657     html_text_limit(ho, text, 0);
1658 }
1659
1660 static void html_text_limit(htmloutput *ho, wchar_t const *text, int maxlen)
1661 {
1662     return_to_neutral(ho);
1663     html_text_limit_internal(ho, text, maxlen, FALSE);
1664 }
1665
1666 static void html_text_limit_internal(htmloutput *ho, wchar_t const *text,
1667                                      int maxlen, int quote_quotes)
1668 {
1669     int textlen = ustrlen(text);
1670     char outbuf[256];
1671     int bytes, err;
1672
1673     if (maxlen > 0 && textlen > maxlen)
1674         textlen = maxlen;
1675
1676     while (textlen > 0) {
1677         /* Scan ahead for characters we really can't display in HTML. */
1678         int lenbefore, lenafter;
1679         for (lenbefore = 0; lenbefore < textlen; lenbefore++)
1680             if (text[lenbefore] == L'<' ||
1681                 text[lenbefore] == L'>' ||
1682                 text[lenbefore] == L'&' ||
1683                 (text[lenbefore] == L'"' && quote_quotes))
1684                 break;
1685         lenafter = lenbefore;
1686         bytes = charset_from_unicode(&text, &lenafter, outbuf, lenof(outbuf),
1687                                      ho->charset, &ho->cstate, &err);
1688         textlen -= (lenbefore - lenafter);
1689         if (bytes > 0)
1690             fwrite(outbuf, 1, bytes, ho->fp);
1691         if (err) {
1692             /*
1693              * We have encountered a character that cannot be
1694              * displayed in the selected output charset. Therefore,
1695              * we use an HTML numeric entity reference.
1696              */
1697             assert(textlen > 0);
1698             fprintf(ho->fp, "&#%ld;", (long int)*text);
1699             text++, textlen--;
1700         } else if (lenafter == 0 && textlen > 0) {
1701             /*
1702              * We have encountered a character which is special to
1703              * HTML.
1704              */
1705             if (*text == L'<')
1706                 fprintf(ho->fp, "&lt;");
1707             else if (*text == L'>')
1708                 fprintf(ho->fp, "&gt;");
1709             else if (*text == L'&')
1710                 fprintf(ho->fp, "&amp;");
1711             else if (*text == L'"')
1712                 fprintf(ho->fp, "&quot;");
1713             else
1714                 assert(!"Can't happen");
1715             text++, textlen--;
1716         }
1717     }
1718 }
1719
1720 static void cleanup(htmloutput *ho)
1721 {
1722     return_to_neutral(ho);
1723     fclose(ho->fp);
1724 }
1725
1726 static void html_href(htmloutput *ho, htmlfile *thisfile,
1727                       htmlfile *targetfile, char *targetfrag)
1728 {
1729     rdstringc rs = { 0, 0, NULL };
1730     char *url;
1731
1732     if (targetfile != thisfile)
1733         rdaddsc(&rs, targetfile->filename);
1734     if (targetfrag) {
1735         rdaddc(&rs, '#');
1736         rdaddsc(&rs, targetfrag);
1737     }
1738     url = rs.text;
1739
1740     element_open(ho, "a");
1741     element_attr(ho, "href", url);
1742     sfree(url);
1743 }
1744
1745 static char *html_format(paragraph *p, char *template_string)
1746 {
1747     char *c, *t;
1748     word *w;
1749     wchar_t *ws, wsbuf[2];
1750     rdstringc rs = { 0, 0, NULL };
1751
1752     t = template_string;
1753     while (*t) {
1754         if (*t == '%' && t[1]) {
1755             int fmt;
1756
1757             t++;
1758             fmt = *t++;
1759
1760             if (fmt == '%') {
1761                 rdaddc(&rs, fmt);
1762                 continue;
1763             }
1764
1765             w = NULL;
1766             ws = NULL;
1767
1768             if (p->kwtext && fmt == 'n')
1769                 w = p->kwtext;
1770             else if (p->kwtext2 && fmt == 'b') {
1771                 /*
1772                  * HTML fragment names must start with a letter, so
1773                  * simply `1.2.3' is not adequate. In this case I'm
1774                  * going to cheat slightly by prepending the first
1775                  * character of the first word of kwtext, so that
1776                  * we get `C1' for chapter 1, `S2.3' for section
1777                  * 2.3 etc.
1778                  */
1779                 if (p->kwtext && p->kwtext->text[0]) {
1780                     ws = wsbuf;
1781                     wsbuf[1] = '\0';
1782                     wsbuf[0] = p->kwtext->text[0];
1783                 }
1784                 w = p->kwtext2;
1785             } else if (p->keyword && *p->keyword && fmt == 'k')
1786                 ws = p->keyword;
1787             else
1788                 w = p->words;
1789
1790             if (ws) {
1791                 c = utoa_dup(ws, CS_ASCII);
1792                 rdaddsc(&rs,c);
1793                 sfree(c);
1794             }
1795
1796             while (w) {
1797                 if (removeattr(w->type) == word_Normal) {
1798                     c = utoa_dup(w->text, CS_ASCII);
1799                     rdaddsc(&rs,c);
1800                     sfree(c);
1801                 }
1802                 w = w->next;
1803             }
1804         } else {
1805             rdaddc(&rs, *t++);
1806         }
1807     }
1808
1809     return rdtrimc(&rs);
1810 }
1811
1812 static char *html_sanitise_fragment(htmlfilelist *files, htmlfile *file,
1813                                     char *text)
1814 {
1815     /*
1816      * The HTML 4 spec's strictest definition of fragment names (<a
1817      * name> and "id" attributes) says that they `must begin with a
1818      * letter and may be followed by any number of letters, digits,
1819      * hyphens, underscores, colons, and periods'.
1820      *
1821      * So here we unceremoniously rip out any characters not
1822      * conforming to this limitation.
1823      */
1824     char *p = text, *q = text;
1825
1826     while (*p && !((*p>='A' && *p<='Z') || (*p>='a' && *p<='z')))
1827         p++;
1828     if ((*q++ = *p++) != '\0') {
1829         while (*p) {
1830             if ((*p>='A' && *p<='Z') ||
1831                 (*p>='a' && *p<='z') ||
1832                 (*p>='0' && *p<='9') ||
1833                 *p=='-' || *p=='_' || *p==':' || *p=='.')
1834                 *q++ = *p;
1835             p++;
1836         }
1837
1838         *q = '\0';
1839     }
1840
1841     /*
1842      * Now we check for clashes with other fragment names, and
1843      * adjust this one if necessary by appending a hyphen followed
1844      * by a number.
1845      */
1846     {
1847         htmlfragment *frag = snew(htmlfragment);
1848         int len = 0;                   /* >0 indicates we have resized */
1849         int suffix = 1;
1850
1851         frag->file = file;
1852         frag->fragment = text;
1853
1854         while (add234(files->frags, frag) != frag) {
1855             if (!len) {
1856                 len = strlen(text);
1857                 frag->fragment = text = sresize(text, len+20, char);
1858             }
1859
1860             sprintf(text + len, "-%d", ++suffix);
1861         }
1862     }
1863
1864     return text;
1865 }
1866
1867 static void html_contents_entry(htmloutput *ho, int depth, htmlsect *s,
1868                                 htmlfile *thisfile, keywordlist *keywords,
1869                                 htmlconfig *cfg)
1870 {
1871     while (ho->contents_level > depth) {
1872         element_close(ho, "ul");
1873         ho->contents_level--;
1874     }
1875
1876     while (ho->contents_level < depth) {
1877         element_open(ho, "ul");
1878         ho->contents_level++;
1879     }
1880
1881     if (!s)
1882         return;
1883
1884     element_open(ho, "li");
1885     html_href(ho, thisfile, s->file, s->fragment);
1886     html_section_title(ho, s, thisfile, keywords, cfg, FALSE);
1887     element_close(ho, "a");
1888     element_close(ho, "li");
1889 }
1890
1891 static void html_section_title(htmloutput *ho, htmlsect *s, htmlfile *thisfile,
1892                                keywordlist *keywords, htmlconfig *cfg,
1893                                int real)
1894 {
1895     if (s->title) {
1896         sectlevel *sl;
1897         word *number;
1898         int depth = heading_depth(s->title);
1899
1900         if (depth < 0)
1901             sl = NULL;
1902         else if (depth == 0)
1903             sl = &cfg->achapter;
1904         else if (depth <= cfg->nasect)
1905             sl = &cfg->asect[depth-1];
1906         else
1907             sl = &cfg->asect[cfg->nasect-1];
1908
1909         if (!sl)
1910             number = NULL;
1911         else if (sl->just_numbers)
1912             number = s->title->kwtext2;
1913         else
1914             number = s->title->kwtext;
1915
1916         if (number) {
1917             html_words(ho, number, MARKUP,
1918                        thisfile, keywords, cfg);
1919             html_text(ho, sl->number_suffix);
1920         }
1921
1922         html_words(ho, s->title->words, real ? ALL : MARKUP,
1923                    thisfile, keywords, cfg);
1924     } else {
1925         assert(s->type != NORMAL);
1926         if (s->type == TOP)
1927             html_text(ho, L"Preamble");/* FIXME: configure */
1928         else if (s->type == INDEX)
1929             html_text(ho, L"Index");/* FIXME: configure */
1930     }
1931 }