Support for TROFF named characters, so we can have proper bullets in our lists,
[sgt/halibut] / bk_man.c
1 /*
2 * man page backend for Halibut
3 */
4
5 #include <stdio.h>
6 #include <stdlib.h>
7 #include <assert.h>
8 #include "halibut.h"
9
/*
 * Settings for the man page backend, as assembled by man_configure().
 */
typedef struct {
    /* Arguments for the .TH line, taken from `man-identity'; a heap
     * copy (or NULL if never configured), freed by man_conf_cleanup(). */
    wchar_t *th;
    /* Boolean from `man-headnumbers': prefix headings with their
     * keyword text when set. */
    int headnumbers;
    /* From `man-mindepth': headings shallower than this are not output. */
    int mindepth;
    /* Output file name; heap-allocated, freed by man_conf_cleanup(). */
    char *filename;
    /* Output character set identifier (defaults to CS_ASCII). */
    int charset;
    /* Bullet text for bulleted list items.  Not owned by this struct. */
    wchar_t *bullet;
    /* Opening quote text.  Not owned by this struct. */
    wchar_t *lquote;
    /* Closing quote text.  Not owned by this struct. */
    wchar_t *rquote;
} manconfig;
18
19 static void man_text(FILE *, word *,
20 int newline, int quote_props, manconfig *conf);
21 static void man_codepara(FILE *, word *, int charset);
22 static int man_convert(wchar_t const *s, int maxlen,
23 char **result, int quote_props,
24 int charset, charset_state *state);
25
26 /*
27 * My TROFF reference is "NROFF/TROFF User's Manual", Joseph
28 * F. Ossana, October 11 1976.
29 *
30 * not yet used:
31 * \(ru rule
32 * \(pl math plus
33 * \(mi math minus
34 * \(eq math equals
35 * \(ga grave accent
36 * \(ul underrule
37 * \(sl slash (matching backslash)
38 * \(br box vertical rule
39 * \(bs Bell System logo
40 * \(or or
41 * all characters for constructing large brackets
42 */
43
/*
 * Map from Unicode code point to the troff escape used to print it.
 *
 * troffchar() binary-searches this table, so the entries must stay in
 * strictly ascending order of `uni'.
 */
static const struct {
    unsigned short uni;         /* Unicode code point */
    char const *troff;          /* troff text to emit for it */
} man_charmap[] = {
    /* Latin-1 symbols */
    {0x00A2, "\\(ct"},          /* cent sign */
    {0x00A7, "\\(sc"},          /* section sign */
    {0x00A9, "\\(co"},          /* copyright sign */
    {0x00AC, "\\(no"},          /* not sign */
    {0x00AE, "\\(rg"},          /* registered sign */
    {0x00B0, "\\(de"},          /* degree sign */
    {0x00B1, "\\(+-"},          /* plus-minus sign */
    {0x00B4, "\\(aa"},          /* acute accent */
    {0x00BC, "\\(14"},          /* one quarter */
    {0x00BD, "\\(12"},          /* one half */
    {0x00BE, "\\(34"},          /* three quarters */
    {0x00D7, "\\(mu"},          /* multiplication sign */
    {0x00F7, "\\(di"},          /* division sign */

    /* Greek capital letters */
    {0x0391, "\\(*A"},          /* Alpha */
    {0x0392, "\\(*B"},          /* Beta */
    {0x0393, "\\(*G"},          /* Gamma */
    {0x0394, "\\(*D"},          /* Delta */
    {0x0395, "\\(*E"},          /* Epsilon */
    {0x0396, "\\(*Z"},          /* Zeta */
    {0x0397, "\\(*Y"},          /* Eta */
    {0x0398, "\\(*H"},          /* Theta */
    {0x0399, "\\(*I"},          /* Iota */
    {0x039A, "\\(*K"},          /* Kappa */
    {0x039B, "\\(*L"},          /* Lambda */
    {0x039C, "\\(*M"},          /* Mu */
    {0x039D, "\\(*N"},          /* Nu */
    {0x039E, "\\(*C"},          /* Xi */
    {0x039F, "\\(*O"},          /* Omicron */
    {0x03A0, "\\(*P"},          /* Pi */
    {0x03A1, "\\(*R"},          /* Rho */
    {0x03A3, "\\(*S"},          /* Sigma */
    {0x03A4, "\\(*T"},          /* Tau */
    {0x03A5, "\\(*U"},          /* Upsilon */
    {0x03A6, "\\(*F"},          /* Phi */
    {0x03A7, "\\(*X"},          /* Chi */
    {0x03A8, "\\(*Q"},          /* Psi */
    {0x03A9, "\\(*W"},          /* Omega */

    /* Greek small letters */
    {0x03B1, "\\(*a"},          /* alpha */
    {0x03B2, "\\(*b"},          /* beta */
    {0x03B3, "\\(*g"},          /* gamma */
    {0x03B4, "\\(*d"},          /* delta */
    {0x03B5, "\\(*e"},          /* epsilon */
    {0x03B6, "\\(*z"},          /* zeta */
    {0x03B7, "\\(*y"},          /* eta */
    {0x03B8, "\\(*h"},          /* theta */
    {0x03B9, "\\(*i"},          /* iota */
    {0x03BA, "\\(*k"},          /* kappa */
    {0x03BB, "\\(*l"},          /* lambda */
    {0x03BC, "\\(*m"},          /* mu */
    {0x03BD, "\\(*n"},          /* nu */
    {0x03BE, "\\(*c"},          /* xi */
    {0x03BF, "\\(*o"},          /* omicron */
    {0x03C0, "\\(*p"},          /* pi */
    {0x03C1, "\\(*r"},          /* rho */
    {0x03C2, "\\(ts"},          /* final sigma */
    {0x03C3, "\\(*s"},          /* sigma */
    {0x03C4, "\\(*t"},          /* tau */
    {0x03C5, "\\(*u"},          /* upsilon */
    {0x03C6, "\\(*f"},          /* phi */
    {0x03C7, "\\(*x"},          /* chi */
    {0x03C8, "\\(*q"},          /* psi */
    {0x03C9, "\\(*w"},          /* omega */

    /* General punctuation */
    {0x2014, "\\(em"},          /* em dash */
    {0x2018, "`"},              /* left single quotation mark */
    {0x2019, "'"},              /* right single quotation mark */
    {0x2020, "\\(dg"},          /* dagger */
    {0x2021, "\\(dd"},          /* double dagger */
    {0x2022, "\\(bu"},          /* bullet */
    {0x2032, "\\(fm"},          /* prime */

    /* Arrows */
    {0x2190, "\\(<-"},          /* leftwards arrow */
    {0x2191, "\\(ua"},          /* upwards arrow */
    {0x2192, "\\(->"},          /* rightwards arrow */
    {0x2193, "\\(da"},          /* downwards arrow */

    /* Mathematical operators */
    {0x2202, "\\(pd"},          /* partial differential */
    {0x2205, "\\(es"},          /* empty set */
    {0x2207, "\\(gr"},          /* nabla */
    {0x2208, "\\(mo"},          /* element of */
    {0x2212, "\\-"},            /* minus sign */
    {0x2217, "\\(**"},          /* asterisk operator */
    {0x221A, "\\(sr"},          /* square root */
    {0x221D, "\\(pt"},          /* proportional to */
    {0x221E, "\\(if"},          /* infinity */
    {0x2229, "\\(ca"},          /* intersection */
    {0x222A, "\\(cu"},          /* union */
    {0x222B, "\\(is"},          /* integral */
    {0x223C, "\\(ap"},          /* tilde operator */
    {0x2245, "\\(~="},          /* approximately equal to */
    {0x2260, "\\(!="},          /* not equal to */
    {0x2261, "\\(=="},          /* identical to */
    {0x2264, "\\(<="},          /* less-than or equal to */
    {0x2265, "\\(>="},          /* greater-than or equal to */
    {0x2282, "\\(sb"},          /* subset of */
    {0x2283, "\\(sp"},          /* superset of */
    {0x2286, "\\(ib"},          /* subset of or equal to */
    {0x2287, "\\(ip"},          /* superset of or equal to */

    /* Geometric shapes */
    {0x25A1, "\\(sq"},          /* white square */
    {0x25CB, "\\(ci"},          /* white circle */

    /* Pointing hands */
    {0x261C, "\\(lh"},          /* left pointing index */
    {0x261E, "\\(rh"},          /* right pointing index */
};
83
84 static char const *troffchar(int unichar) {
85 int i, j, k;
86
87 i = -1;
88 j = lenof(man_charmap);
89 while (j-i > 1) {
90 k = (i + j) / 2;
91 if (man_charmap[k].uni == unichar)
92 return man_charmap[k].troff;
93 else if (man_charmap[k].uni > unichar)
94 j = k;
95 else
96 i = k;
97 }
98 return NULL;
99 }
100
101 /*
102 * Return TRUE if we can represent the whole of the given string either
103 * in the output charset or as named characters; FALSE otherwise.
104 */
105 static int troff_ok(int charset, wchar_t *string) {
106 wchar_t test[2];
107 while (*string) {
108 test[0] = *string;
109 test[1] = 0;
110 if (!cvt_ok(charset, test) && !troffchar(*string))
111 return FALSE;
112 string++;
113 }
114 return TRUE;
115 }
116
117 static manconfig man_configure(paragraph *source) {
118 paragraph *p;
119 manconfig ret;
120
121 /*
122 * Defaults.
123 */
124 ret.th = NULL;
125 ret.headnumbers = FALSE;
126 ret.mindepth = 0;
127 ret.filename = dupstr("output.1");
128 ret.charset = CS_ASCII;
129 ret.bullet = L"\x2022\0o\0\0";
130 ret.lquote = L"\x2018\0\x2019\0\"\0\"\0\0";
131 ret.rquote = uadv(ret.lquote);
132
133 /*
134 * Two-pass configuration so that we can pick up global config
135 * (e.g. `quotes') before having it overridden by specific
136 * config (`man-quotes'), irrespective of the order in which
137 * they occur.
138 */
139 for (p = source; p; p = p->next) {
140 if (p->type == para_Config) {
141 if (!ustricmp(p->keyword, L"quotes")) {
142 if (*uadv(p->keyword) && *uadv(uadv(p->keyword))) {
143 ret.lquote = uadv(p->keyword);
144 ret.rquote = uadv(ret.lquote);
145 }
146 }
147 }
148 }
149
150 for (p = source; p; p = p->next) {
151 if (p->type == para_Config) {
152 if (!ustricmp(p->keyword, L"man-identity")) {
153 wchar_t *wp, *ep;
154
155 wp = uadv(p->keyword);
156 ep = wp;
157 while (*ep)
158 ep = uadv(ep);
159 sfree(ret.th);
160 ret.th = snewn(ep - wp + 1, wchar_t);
161 memcpy(ret.th, wp, (ep - wp + 1) * sizeof(wchar_t));
162 } else if (!ustricmp(p->keyword, L"man-charset")) {
163 ret.charset = charset_from_ustr(&p->fpos, uadv(p->keyword));
164 } else if (!ustricmp(p->keyword, L"man-headnumbers")) {
165 ret.headnumbers = utob(uadv(p->keyword));
166 } else if (!ustricmp(p->keyword, L"man-mindepth")) {
167 ret.mindepth = utoi(uadv(p->keyword));
168 } else if (!ustricmp(p->keyword, L"man-filename")) {
169 sfree(ret.filename);
170 ret.filename = dupstr(adv(p->origkeyword));
171 } else if (!ustricmp(p->keyword, L"man-bullet")) {
172 ret.bullet = uadv(p->keyword);
173 } else if (!ustricmp(p->keyword, L"man-quotes")) {
174 if (*uadv(p->keyword) && *uadv(uadv(p->keyword))) {
175 ret.lquote = uadv(p->keyword);
176 ret.rquote = uadv(ret.lquote);
177 }
178 }
179 }
180 }
181
182 /*
183 * Now process fallbacks on quote characters and bullets.
184 */
185 while (*uadv(ret.rquote) && *uadv(uadv(ret.rquote)) &&
186 (!troff_ok(ret.charset, ret.lquote) ||
187 !troff_ok(ret.charset, ret.rquote))) {
188 ret.lquote = uadv(ret.rquote);
189 ret.rquote = uadv(ret.lquote);
190 }
191
192 while (*ret.bullet && *uadv(ret.bullet) &&
193 !troff_ok(ret.charset, ret.bullet))
194 ret.bullet = uadv(ret.bullet);
195
196 return ret;
197 }
198
199 static void man_conf_cleanup(manconfig cf)
200 {
201 sfree(cf.th);
202 sfree(cf.filename);
203 }
204
205 paragraph *man_config_filename(char *filename)
206 {
207 return cmdline_cfg_simple("man-filename", filename, NULL);
208 }
209
/*
 * Bit flags combined into the `quote_props' arguments in this file.
 */
#define QUOTE_INITCTRL (1 << 0) /* quote initial . and ' on a line */
#define QUOTE_QUOTES   (1 << 1) /* quote double quotes by doubling them */
212
213 void man_backend(paragraph *sourceform, keywordlist *keywords,
214 indexdata *idx, void *unused) {
215 paragraph *p;
216 FILE *fp;
217 manconfig conf;
218 int had_described_thing;
219
220 IGNORE(unused);
221 IGNORE(keywords);
222 IGNORE(idx);
223
224 conf = man_configure(sourceform);
225
226 /*
227 * Open the output file.
228 */
229 fp = fopen(conf.filename, "w");
230 if (!fp) {
231 error(err_cantopenw, conf.filename);
232 return;
233 }
234
235 /* Do the version ID */
236 for (p = sourceform; p; p = p->next)
237 if (p->type == para_VersionID) {
238 fprintf(fp, ".\\\" ");
239 man_text(fp, p->words, TRUE, 0, &conf);
240 }
241
242 /* .TH name-of-program manual-section */
243 fprintf(fp, ".TH");
244 if (conf.th && *conf.th) {
245 char *c;
246 wchar_t *wp;
247
248 for (wp = conf.th; *wp; wp = uadv(wp)) {
249 fputs(" \"", fp);
250 man_convert(wp, 0, &c, QUOTE_QUOTES, conf.charset, NULL);
251 fputs(c, fp);
252 sfree(c);
253 fputc('"', fp);
254 }
255 }
256 fputc('\n', fp);
257
258 had_described_thing = FALSE;
259 #define cleanup_described_thing do { \
260 if (had_described_thing) \
261 fprintf(fp, "\n"); \
262 had_described_thing = FALSE; \
263 } while (0)
264
265 for (p = sourceform; p; p = p->next) switch (p->type) {
266 /*
267 * Things we ignore because we've already processed them or
268 * aren't going to touch them in this pass.
269 */
270 case para_IM:
271 case para_BR:
272 case para_Biblio: /* only touch BiblioCited */
273 case para_VersionID:
274 case para_NoCite:
275 case para_Title:
276 break;
277
278 /*
279 * Headings.
280 */
281 case para_Chapter:
282 case para_Appendix:
283 case para_UnnumberedChapter:
284 case para_Heading:
285 case para_Subsect:
286
287 cleanup_described_thing;
288 {
289 int depth;
290 if (p->type == para_Subsect)
291 depth = p->aux + 1;
292 else if (p->type == para_Heading)
293 depth = 1;
294 else
295 depth = 0;
296 if (depth >= conf.mindepth) {
297 if (depth > conf.mindepth)
298 fprintf(fp, ".SS \"");
299 else
300 fprintf(fp, ".SH \"");
301 if (conf.headnumbers && p->kwtext) {
302 man_text(fp, p->kwtext, FALSE, QUOTE_QUOTES, &conf);
303 fprintf(fp, " ");
304 }
305 man_text(fp, p->words, FALSE, QUOTE_QUOTES, &conf);
306 fprintf(fp, "\"\n");
307 }
308 break;
309 }
310
311 /*
312 * Code paragraphs.
313 */
314 case para_Code:
315 cleanup_described_thing;
316 fprintf(fp, ".PP\n");
317 man_codepara(fp, p->words, conf.charset);
318 break;
319
320 /*
321 * Normal paragraphs.
322 */
323 case para_Normal:
324 case para_Copyright:
325 cleanup_described_thing;
326 fprintf(fp, ".PP\n");
327 man_text(fp, p->words, TRUE, 0, &conf);
328 break;
329
330 /*
331 * List paragraphs.
332 */
333 case para_Description:
334 case para_BiblioCited:
335 case para_Bullet:
336 case para_NumberedList:
337 if (p->type != para_Description)
338 cleanup_described_thing;
339
340 if (p->type == para_Bullet) {
341 char *bullettext;
342 man_convert(conf.bullet, -1, &bullettext, QUOTE_QUOTES,
343 conf.charset, NULL);
344 fprintf(fp, ".IP \"\\fB%s\\fP\"\n", bullettext);
345 sfree(bullettext);
346 } else if (p->type == para_NumberedList) {
347 fprintf(fp, ".IP \"");
348 man_text(fp, p->kwtext, FALSE, QUOTE_QUOTES, &conf);
349 fprintf(fp, "\"\n");
350 } else if (p->type == para_Description) {
351 if (had_described_thing) {
352 /*
353 * Do nothing; the .xP for this paragraph is the
354 * .IP which has come before it in the
355 * DescribedThing.
356 */
357 } else {
358 /*
359 * A \dd without a preceding \dt is given a blank
360 * one.
361 */
362 fprintf(fp, ".IP \"\"\n");
363 }
364 } else if (p->type == para_BiblioCited) {
365 fprintf(fp, ".IP \"");
366 man_text(fp, p->kwtext, FALSE, QUOTE_QUOTES, &conf);
367 fprintf(fp, "\"\n");
368 }
369 man_text(fp, p->words, TRUE, 0, &conf);
370 had_described_thing = FALSE;
371 break;
372
373 case para_DescribedThing:
374 cleanup_described_thing;
375 fprintf(fp, ".IP \"");
376 man_text(fp, p->words, FALSE, QUOTE_QUOTES, &conf);
377 fprintf(fp, "\"\n");
378 had_described_thing = TRUE;
379 break;
380
381 case para_Rule:
382 /*
383 * New paragraph containing a horizontal line 1/2em above the
384 * baseline whose length is the line length minus the current
385 * indent.
386 */
387 cleanup_described_thing;
388 fprintf(fp, ".PP\n\\u\\l'\\n(.lu-\\n(.iu'\\d\n");
389 break;
390
391 case para_LcontPush:
392 case para_QuotePush:
393 cleanup_described_thing;
394 fprintf(fp, ".RS\n");
395 break;
396 case para_LcontPop:
397 case para_QuotePop:
398 cleanup_described_thing;
399 fprintf(fp, ".RE\n");
400 break;
401 }
402 cleanup_described_thing;
403
404 /*
405 * Tidy up.
406 */
407 fclose(fp);
408 man_conf_cleanup(conf);
409 }
410
411 /*
412 * Convert a wide string into a string of chars; mallocs the
413 * resulting string and stores a pointer to it in `*result'.
414 *
415 * If `state' is non-NULL, updates the charset state pointed to. If
416 * `state' is NULL, this function uses its own state, initialises
417 * it from scratch, and cleans it up when finished. If `state' is
418 * non-NULL but _s_ is NULL, cleans up a provided state.
419 *
420 * Return is nonzero if all characters are OK. If not all
421 * characters are OK but `result' is non-NULL, a result _will_
422 * still be generated!
423 *
424 * This function also does escaping of groff special characters.
425 */
426 static int man_convert(wchar_t const *s, int maxlen,
427 char **result, int quote_props,
428 int charset, charset_state *state) {
429 charset_state internal_state = CHARSET_INIT_STATE;
430 int slen, err;
431 char *p = NULL, *q;
432 int plen = 0, psize = 0;
433 rdstringc out = {0, 0, NULL};
434 int anyerr = 0;
435
436 if (!state)
437 state = &internal_state;
438
439 slen = (s ? ustrlen(s) : 0);
440
441 if (slen > maxlen && maxlen > 0)
442 slen = maxlen;
443
444 psize = 384;
445 plen = 0;
446 p = snewn(psize, char);
447 err = 0;
448
449 while (slen > 0) {
450 int ret = charset_from_unicode(&s, &slen, p, psize,
451 charset, state, &err);
452 plen = ret;
453
454 for (q = p; q < p+plen; q++) {
455 if (q == p && (*q == '.' || *q == '\'') &&
456 (quote_props & QUOTE_INITCTRL)) {
457 /*
458 * Control character (. or ') at the start of a
459 * line. Quote it by putting \& (troff zero-width
460 * space) before it.
461 */
462 rdaddc(&out, '\\');
463 rdaddc(&out, '&');
464 } else if (*q == '\\' || *q == '`') {
465 /*
466 * Quote backslashes and backticks always.
467 */
468 rdaddc(&out, '\\');
469 } else if (*q == '"' && (quote_props & QUOTE_QUOTES)) {
470 /*
471 * Double quote within double quotes. Quote it by
472 * doubling.
473 */
474 rdaddc(&out, '"');
475 }
476 rdaddc(&out, *q);
477 }
478 if (err) {
479 char const *tr = troffchar(*s);
480 if (tr == NULL)
481 anyerr = TRUE;
482 else
483 rdaddsc(&out, tr);
484 s++; slen--;
485 }
486 /* Past start of string -- no more quoting needed */
487 quote_props &= ~QUOTE_INITCTRL;
488 }
489
490 if (state == &internal_state || s == NULL) {
491 int ret = charset_from_unicode(NULL, 0, p+plen, psize-plen,
492 charset, state, NULL);
493 if (ret > 0)
494 plen += ret;
495 }
496
497 sfree(p);
498
499 if (out.text)
500 *result = rdtrimc(&out);
501 else
502 *result = dupstr("");
503
504 return !anyerr;
505 }
506
507 static int man_rdaddwc(rdstringc *rs, word *text, word *end,
508 int quote_props, manconfig *conf,
509 charset_state *state) {
510 char *c;
511
512 for (; text && text != end; text = text->next) switch (text->type) {
513 case word_HyperLink:
514 case word_HyperEnd:
515 case word_UpperXref:
516 case word_LowerXref:
517 case word_XrefEnd:
518 case word_IndexRef:
519 break;
520
521 case word_Normal:
522 case word_Emph:
523 case word_Code:
524 case word_WeakCode:
525 case word_WhiteSpace:
526 case word_EmphSpace:
527 case word_CodeSpace:
528 case word_WkCodeSpace:
529 case word_Quote:
530 case word_EmphQuote:
531 case word_CodeQuote:
532 case word_WkCodeQuote:
533 assert(text->type != word_CodeQuote &&
534 text->type != word_WkCodeQuote);
535
536 if (towordstyle(text->type) == word_Emph &&
537 (attraux(text->aux) == attr_First ||
538 attraux(text->aux) == attr_Only)) {
539 man_convert(NULL, 0, &c, quote_props, conf->charset, state);
540 rdaddsc(rs, c);
541 if (*c)
542 quote_props &= ~QUOTE_INITCTRL; /* not at start any more */
543 sfree(c);
544 *state = charset_init_state;
545 rdaddsc(rs, "\\fI");
546 } else if ((towordstyle(text->type) == word_Code ||
547 towordstyle(text->type) == word_WeakCode) &&
548 (attraux(text->aux) == attr_First ||
549 attraux(text->aux) == attr_Only)) {
550 man_convert(NULL, 0, &c, quote_props, conf->charset, state);
551 rdaddsc(rs, c);
552 if (*c)
553 quote_props &= ~QUOTE_INITCTRL; /* not at start any more */
554 sfree(c);
555 *state = charset_init_state;
556 rdaddsc(rs, "\\fB");
557 }
558
559 if (removeattr(text->type) == word_Normal) {
560 charset_state s2 = *state;
561
562 if (man_convert(text->text, 0, &c, quote_props, conf->charset, &s2) ||
563 !text->alt) {
564 rdaddsc(rs, c);
565 if (*c)
566 quote_props &= ~QUOTE_INITCTRL; /* not at start any more */
567 *state = s2;
568 } else {
569 quote_props = man_rdaddwc(rs, text->alt, NULL,
570 quote_props, conf, state);
571 }
572 sfree(c);
573 } else if (removeattr(text->type) == word_WhiteSpace) {
574 man_convert(L" ", 1, &c, quote_props, conf->charset, state);
575 rdaddsc(rs, c);
576 if (*c)
577 quote_props &= ~QUOTE_INITCTRL; /* not at start any more */
578 sfree(c);
579 } else if (removeattr(text->type) == word_Quote) {
580 man_convert(quoteaux(text->aux) == quote_Open ?
581 conf->lquote : conf->rquote, 0,
582 &c, quote_props, conf->charset, state);
583 rdaddsc(rs, c);
584 if (*c)
585 quote_props &= ~QUOTE_INITCTRL; /* not at start any more */
586 sfree(c);
587 }
588 if (towordstyle(text->type) != word_Normal &&
589 (attraux(text->aux) == attr_Last ||
590 attraux(text->aux) == attr_Only)) {
591 man_convert(NULL, 0, &c, quote_props, conf->charset, state);
592 rdaddsc(rs, c);
593 if (*c)
594 quote_props &= ~QUOTE_INITCTRL; /* not at start any more */
595 sfree(c);
596 *state = charset_init_state;
597 rdaddsc(rs, "\\fP");
598 }
599 break;
600 }
601 man_convert(NULL, 0, &c, quote_props, conf->charset, state);
602 rdaddsc(rs, c);
603 if (*c)
604 quote_props &= ~QUOTE_INITCTRL; /* not at start any more */
605 sfree(c);
606
607 return quote_props;
608 }
609
610 static void man_text(FILE *fp, word *text, int newline,
611 int quote_props, manconfig *conf) {
612 rdstringc t = { 0, 0, NULL };
613 charset_state state = CHARSET_INIT_STATE;
614
615 man_rdaddwc(&t, text, NULL, quote_props | QUOTE_INITCTRL, conf, &state);
616 fprintf(fp, "%s", t.text);
617 sfree(t.text);
618 if (newline)
619 fputc('\n', fp);
620 }
621
622 static void man_codepara(FILE *fp, word *text, int charset) {
623 fprintf(fp, ".nf\n");
624 for (; text; text = text->next) if (text->type == word_WeakCode) {
625 char *c;
626 wchar_t *t, *e;
627 int quote_props = QUOTE_INITCTRL;
628
629 t = text->text;
630 if (text->next && text->next->type == word_Emph) {
631 e = text->next->text;
632 text = text->next;
633 } else
634 e = NULL;
635
636 while (e && *e && *t) {
637 int n;
638 int ec = *e;
639
640 for (n = 0; t[n] && e[n] && e[n] == ec; n++);
641 if (ec == 'i')
642 fprintf(fp, "\\fI");
643 else if (ec == 'b')
644 fprintf(fp, "\\fB");
645 man_convert(t, n, &c, quote_props, charset, NULL);
646 quote_props &= ~QUOTE_INITCTRL;
647 fprintf(fp, "%s", c);
648 sfree(c);
649 if (ec == 'i' || ec == 'b')
650 fprintf(fp, "\\fP");
651 t += n;
652 e += n;
653 }
654 man_convert(t, 0, &c, quote_props, charset, NULL);
655 fprintf(fp, "%s\n", c);
656 sfree(c);
657 }
658 fprintf(fp, ".fi\n");
659 }