Charset support for the man page backend (\cfg{man-charset}).
[sgt/halibut] / bk_man.c
1 /*
2 * man page backend for Halibut
3 */
4
5 #include <stdio.h>
6 #include <stdlib.h>
7 #include <assert.h>
8 #include "halibut.h"
9
10 static void man_text(FILE *, word *,
11 int newline, int quote_props, int charset);
12 static void man_codepara(FILE *, word *, int charset);
13 static int man_convert(wchar_t const *s, int maxlen,
14 char **result, int quote_props,
15 int charset, charset_state *state);
16
/*
 * Configuration for one run of the man page backend, as gathered by
 * man_configure() from the \cfg paragraphs of the source document.
 */
typedef struct {
    /*
     * Arguments for the .TH line: a private copy of the strings
     * following the "man-identity" keyword, laid out back to back
     * and terminated by an empty string. NULL if never configured.
     */
    wchar_t *th;
    /* If true, a heading's kwtext is printed in front of its title. */
    int headnumbers;
    /* Headings whose computed depth is below this are not emitted. */
    int mindepth;
    /* Dynamically allocated name of the output file. */
    char *filename;
    /* Charset identifier used when converting output text. */
    int charset;
} manconfig;
24
25 static manconfig man_configure(paragraph *source) {
26 manconfig ret;
27
28 /*
29 * Defaults.
30 */
31 ret.th = NULL;
32 ret.headnumbers = FALSE;
33 ret.mindepth = 0;
34 ret.filename = dupstr("output.1");
35 ret.charset = CS_ASCII;
36
37 for (; source; source = source->next) {
38 if (source->type == para_Config) {
39 if (!ustricmp(source->keyword, L"man-identity")) {
40 wchar_t *wp, *ep;
41
42 wp = uadv(source->keyword);
43 ep = wp;
44 while (*ep)
45 ep = uadv(ep);
46 sfree(ret.th);
47 ret.th = mknewa(wchar_t, ep - wp + 1);
48 memcpy(ret.th, wp, (ep - wp + 1) * sizeof(wchar_t));
49 } else if (!ustricmp(source->keyword, L"man-charset")) {
50 char *csname = utoa_dup(uadv(source->keyword), CS_ASCII);
51 ret.charset = charset_from_localenc(csname);
52 sfree(csname);
53 } else if (!ustricmp(source->keyword, L"man-headnumbers")) {
54 ret.headnumbers = utob(uadv(source->keyword));
55 } else if (!ustricmp(source->keyword, L"man-mindepth")) {
56 ret.mindepth = utoi(uadv(source->keyword));
57 } else if (!ustricmp(source->keyword, L"man-filename")) {
58 sfree(ret.filename);
59 ret.filename = dupstr(adv(source->origkeyword));
60 }
61 }
62 }
63
64 return ret;
65 }
66
67 static void man_conf_cleanup(manconfig cf)
68 {
69 sfree(cf.th);
70 sfree(cf.filename);
71 }
72
73 paragraph *man_config_filename(char *filename)
74 {
75 return cmdline_cfg_simple("man-filename", filename, NULL);
76 }
77
/*
 * Bit flags for the `quote_props' argument of the conversion
 * routines; they may be ORed together.
 */
#define QUOTE_INITCTRL 0x01   /* escape a leading . or ' on a line */
#define QUOTE_QUOTES   0x02   /* escape double quotes by doubling them */
80
81 void man_backend(paragraph *sourceform, keywordlist *keywords,
82 indexdata *idx, void *unused) {
83 paragraph *p;
84 FILE *fp;
85 manconfig conf;
86
87 IGNORE(unused);
88 IGNORE(keywords);
89 IGNORE(idx);
90
91 conf = man_configure(sourceform);
92
93 /*
94 * Open the output file.
95 */
96 fp = fopen(conf.filename, "w");
97 if (!fp) {
98 error(err_cantopenw, conf.filename);
99 return;
100 }
101
102 /* Do the version ID */
103 for (p = sourceform; p; p = p->next)
104 if (p->type == para_VersionID) {
105 fprintf(fp, ".\\\" ");
106 man_text(fp, p->words, TRUE, 0, conf.charset);
107 }
108
109 /* .TH name-of-program manual-section */
110 fprintf(fp, ".TH");
111 if (conf.th && *conf.th) {
112 char *c;
113 wchar_t *wp;
114
115 for (wp = conf.th; *wp; wp = uadv(wp)) {
116 fputs(" \"", fp);
117 man_convert(wp, 0, &c, QUOTE_QUOTES, conf.charset, NULL);
118 fputs(c, fp);
119 sfree(c);
120 fputc('"', fp);
121 }
122 }
123 fputc('\n', fp);
124
125 fprintf(fp, ".UC\n");
126
127 for (p = sourceform; p; p = p->next) switch (p->type) {
128 /*
129 * Things we ignore because we've already processed them or
130 * aren't going to touch them in this pass.
131 */
132 case para_IM:
133 case para_BR:
134 case para_Biblio: /* only touch BiblioCited */
135 case para_VersionID:
136 case para_NoCite:
137 case para_Title:
138 break;
139
140 /*
141 * Headings.
142 */
143 case para_Chapter:
144 case para_Appendix:
145 case para_UnnumberedChapter:
146 case para_Heading:
147 case para_Subsect:
148
149 {
150 int depth;
151 if (p->type == para_Subsect)
152 depth = p->aux + 2;
153 else if (p->type == para_Heading)
154 depth = 1;
155 else
156 depth = 0;
157 if (depth >= conf.mindepth) {
158 fprintf(fp, ".SH \"");
159 if (conf.headnumbers && p->kwtext) {
160 man_text(fp, p->kwtext, FALSE, QUOTE_QUOTES, conf.charset);
161 fprintf(fp, " ");
162 }
163 man_text(fp, p->words, FALSE, QUOTE_QUOTES, conf.charset);
164 fprintf(fp, "\"\n");
165 }
166 break;
167 }
168
169 /*
170 * Code paragraphs.
171 */
172 case para_Code:
173 fprintf(fp, ".PP\n");
174 man_codepara(fp, p->words, conf.charset);
175 break;
176
177 /*
178 * Normal paragraphs.
179 */
180 case para_Normal:
181 case para_Copyright:
182 fprintf(fp, ".PP\n");
183 man_text(fp, p->words, TRUE, 0, conf.charset);
184 break;
185
186 /*
187 * List paragraphs.
188 */
189 case para_Description:
190 case para_BiblioCited:
191 case para_Bullet:
192 case para_NumberedList:
193 if (p->type == para_Bullet) {
194 fprintf(fp, ".IP \"\\fBo\\fP\"\n"); /* FIXME: configurable? */
195 } else if (p->type == para_NumberedList) {
196 fprintf(fp, ".IP \"");
197 man_text(fp, p->kwtext, FALSE, QUOTE_QUOTES, conf.charset);
198 fprintf(fp, "\"\n");
199 } else if (p->type == para_Description) {
200 /*
201 * Do nothing; the .xP for this paragraph is the .IP
202 * which has come before it in the DescribedThing.
203 */
204 } else if (p->type == para_BiblioCited) {
205 fprintf(fp, ".IP \"");
206 man_text(fp, p->kwtext, FALSE, QUOTE_QUOTES, conf.charset);
207 fprintf(fp, "\"\n");
208 }
209 man_text(fp, p->words, TRUE, 0, conf.charset);
210 break;
211
212 case para_DescribedThing:
213 fprintf(fp, ".IP \"");
214 man_text(fp, p->words, FALSE, QUOTE_QUOTES, conf.charset);
215 fprintf(fp, "\"\n");
216 break;
217
218 case para_Rule:
219 /*
220 * This isn't terribly good. Anyone who wants to do better
221 * should feel free!
222 */
223 fprintf(fp, ".PP\n----------------------------------------\n");
224 break;
225
226 case para_LcontPush:
227 case para_QuotePush:
228 fprintf(fp, ".RS\n");
229 break;
230 case para_LcontPop:
231 case para_QuotePop:
232 fprintf(fp, ".RE\n");
233 break;
234 }
235
236 /*
237 * Tidy up.
238 */
239 fclose(fp);
240 man_conf_cleanup(conf);
241 }
242
243 /*
244 * Convert a wide string into a string of chars; mallocs the
245 * resulting string and stores a pointer to it in `*result'.
246 *
247 * If `state' is non-NULL, updates the charset state pointed to. If
248 * `state' is NULL, this function uses its own state, initialises
249 * it from scratch, and cleans it up when finished. If `state' is
250 * non-NULL but _s_ is NULL, cleans up a provided state.
251 *
252 * Return is nonzero if all characters are OK. If not all
253 * characters are OK but `result' is non-NULL, a result _will_
254 * still be generated!
255 *
256 * This function also does escaping of groff special characters.
257 */
258 static int man_convert(wchar_t const *s, int maxlen,
259 char **result, int quote_props,
260 int charset, charset_state *state) {
261 charset_state internal_state = CHARSET_INIT_STATE;
262 int slen, err;
263 char *p = NULL, *q;
264 int plen = 0, psize = 0;
265 rdstringc out = {0, 0, NULL};
266
267 if (!state)
268 state = &internal_state;
269
270 slen = (s ? ustrlen(s) : 0);
271
272 if (slen > maxlen && maxlen > 0)
273 slen = maxlen;
274
275 psize = 384;
276 plen = 0;
277 p = mknewa(char, psize);
278 err = 0;
279
280 while (slen > 0) {
281 int ret = charset_from_unicode(&s, &slen, p+plen, psize-plen,
282 charset, state, (err ? NULL : &err));
283 if (ret > 0) {
284 plen += ret;
285 if (psize - plen < 256) {
286 psize = plen + 256;
287 p = resize(p, psize);
288 }
289 }
290 }
291
292 if (state == &internal_state || s == NULL) {
293 int ret = charset_from_unicode(NULL, 0, p+plen, psize-plen,
294 charset, state, NULL);
295 if (ret > 0)
296 plen += ret;
297 }
298
299 for (q = p; q < p+plen; q++) {
300 if (q == p && (*q == '.' || *q == '\'') &&
301 (quote_props & QUOTE_INITCTRL)) {
302 /*
303 * Control character (. or ') at the start of a
304 * line. Quote it by putting \& (troff zero-width
305 * space) before it.
306 */
307 rdaddc(&out, '\\');
308 rdaddc(&out, '&');
309 } else if (*q == '\\') {
310 /*
311 * Quote backslashes by doubling them, always.
312 */
313 rdaddc(&out, '\\');
314 } else if (*q == '"' && (quote_props & QUOTE_QUOTES)) {
315 /*
316 * Double quote within double quotes. Quote it by
317 * doubling.
318 */
319 rdaddc(&out, '"');
320 }
321 rdaddc(&out, *q);
322 }
323
324 sfree(p);
325
326 if (out.text)
327 *result = rdtrimc(&out);
328 else
329 *result = dupstr("");
330
331 return !err;
332 }
333
334 static void man_rdaddwc(rdstringc *rs, word *text, word *end,
335 int quote_props, int charset, charset_state *state) {
336 char *c;
337
338 for (; text && text != end; text = text->next) switch (text->type) {
339 case word_HyperLink:
340 case word_HyperEnd:
341 case word_UpperXref:
342 case word_LowerXref:
343 case word_XrefEnd:
344 case word_IndexRef:
345 break;
346
347 case word_Normal:
348 case word_Emph:
349 case word_Code:
350 case word_WeakCode:
351 case word_WhiteSpace:
352 case word_EmphSpace:
353 case word_CodeSpace:
354 case word_WkCodeSpace:
355 case word_Quote:
356 case word_EmphQuote:
357 case word_CodeQuote:
358 case word_WkCodeQuote:
359 assert(text->type != word_CodeQuote &&
360 text->type != word_WkCodeQuote);
361
362 if (towordstyle(text->type) == word_Emph &&
363 (attraux(text->aux) == attr_First ||
364 attraux(text->aux) == attr_Only)) {
365 if (rs->pos > 0)
366 quote_props &= ~QUOTE_INITCTRL; /* not at start any more */
367 man_convert(NULL, 0, &c, quote_props, charset, state);
368 rdaddsc(rs, c);
369 sfree(c);
370 *state = charset_init_state;
371 rdaddsc(rs, "\\fI");
372 } else if ((towordstyle(text->type) == word_Code ||
373 towordstyle(text->type) == word_WeakCode) &&
374 (attraux(text->aux) == attr_First ||
375 attraux(text->aux) == attr_Only)) {
376 if (rs->pos > 0)
377 quote_props &= ~QUOTE_INITCTRL; /* not at start any more */
378 man_convert(NULL, 0, &c, quote_props, charset, state);
379 rdaddsc(rs, c);
380 sfree(c);
381 *state = charset_init_state;
382 rdaddsc(rs, "\\fB");
383 }
384
385 if (removeattr(text->type) == word_Normal) {
386 charset_state s2 = *state;
387
388 if (rs->pos > 0)
389 quote_props &= ~QUOTE_INITCTRL; /* not at start any more */
390 if (man_convert(text->text, 0, &c, quote_props, charset, &s2) ||
391 !text->alt) {
392 rdaddsc(rs, c);
393 *state = s2;
394 } else {
395 man_rdaddwc(rs, text->alt, NULL, quote_props, charset, state);
396 }
397 sfree(c);
398 } else if (removeattr(text->type) == word_WhiteSpace) {
399 if (rs->pos > 0)
400 quote_props &= ~QUOTE_INITCTRL; /* not at start any more */
401 man_convert(L" ", 1, &c, quote_props, charset, state);
402 rdaddsc(rs, c);
403 sfree(c);
404 } else if (removeattr(text->type) == word_Quote) {
405 if (rs->pos > 0)
406 quote_props &= ~QUOTE_INITCTRL; /* not at start any more */
407 man_convert(L"\"", 1, &c, quote_props, charset, state);
408 rdaddsc(rs, c);
409 sfree(c);
410 }
411 if (towordstyle(text->type) != word_Normal &&
412 (attraux(text->aux) == attr_Last ||
413 attraux(text->aux) == attr_Only)) {
414 if (rs->pos > 0)
415 quote_props &= ~QUOTE_INITCTRL; /* not at start any more */
416 man_convert(NULL, 0, &c, quote_props, charset, state);
417 rdaddsc(rs, c);
418 sfree(c);
419 *state = charset_init_state;
420 rdaddsc(rs, "\\fP");
421 }
422 break;
423 }
424 man_convert(NULL, 0, &c, quote_props, charset, state);
425 rdaddsc(rs, c);
426 sfree(c);
427 }
428
429 static void man_text(FILE *fp, word *text, int newline,
430 int quote_props, int charset) {
431 rdstringc t = { 0, 0, NULL };
432 charset_state state = CHARSET_INIT_STATE;
433
434 man_rdaddwc(&t, text, NULL, quote_props | QUOTE_INITCTRL, charset, &state);
435 fprintf(fp, "%s", t.text);
436 sfree(t.text);
437 if (newline)
438 fputc('\n', fp);
439 }
440
441 static void man_codepara(FILE *fp, word *text, int charset) {
442 fprintf(fp, ".nf\n");
443 for (; text; text = text->next) if (text->type == word_WeakCode) {
444 char *c;
445 wchar_t *t, *e;
446 int quote_props = QUOTE_INITCTRL;
447
448 t = text->text;
449 if (text->next && text->next->type == word_Emph) {
450 e = text->next->text;
451 text = text->next;
452 } else
453 e = NULL;
454
455 while (e && *e && *t) {
456 int n;
457 int ec = *e;
458
459 for (n = 0; t[n] && e[n] && e[n] == ec; n++);
460 if (ec == 'i')
461 fprintf(fp, "\\fI");
462 else if (ec == 'b')
463 fprintf(fp, "\\fB");
464 man_convert(t, n, &c, quote_props, charset, NULL);
465 quote_props &= ~QUOTE_INITCTRL;
466 fprintf(fp, "%s", c);
467 sfree(c);
468 if (ec == 'i' || ec == 'b')
469 fprintf(fp, "\\fP");
470 t += n;
471 e += n;
472 }
473 man_convert(t, 0, &c, quote_props, charset, NULL);
474 fprintf(fp, "%s\n", c);
475 sfree(c);
476 }
477 fprintf(fp, ".fi\n");
478 }