Text backend is now charset-enabled: the entire text file is built
[sgt/halibut] / bk_text.c
1 /*
2 * text backend for Halibut
3 */
4
5 #include <stdio.h>
6 #include <stdlib.h>
7 #include <assert.h>
8 #include "halibut.h"
9
/*
 * Horizontal placement of a heading, as interpreted by text_heading():
 *  - LEFT:     heading starts at column zero.
 *  - LEFTPLUS: the prefix is pushed out into the left margin so that
 *              the heading text itself starts at the body indent.
 *  - CENTRE:   each line is centred within indent + width columns.
 */
typedef enum {
    LEFT,
    LEFTPLUS,
    CENTRE
} alignment;

/*
 * Formatting parameters for one class of heading.
 */
typedef struct {
    /* Where text_heading() places the heading. */
    alignment align;
    /* Nonzero: text_heading() uses its nprefix argument as the prefix;
     * zero: it uses tprefix instead. */
    int just_numbers;
    /* Character repeated under each heading line; L'\0' disables it. */
    wchar_t underline;
    /* Text appended directly after the prefix. */
    wchar_t *number_suffix;
} alignstruct;
17
18 typedef struct {
19 int indent, indent_code;
20 int listindentbefore, listindentafter;
21 int width;
22 alignstruct atitle, achapter, *asect;
23 int nasect;
24 int include_version_id;
25 int indent_preambles;
26 int charset;
27 word bullet;
28 char *filename;
29 } textconfig;
30
31 typedef struct {
32 FILE *fp;
33 int charset;
34 charset_state state;
35 } textfile;
36
37 static void text_heading(textfile *, word *, word *, word *, alignstruct,
38 int,int);
39 static void text_rule(textfile *, int, int);
40 static void text_para(textfile *, word *, wchar_t *, word *, int, int, int);
41 static void text_codepara(textfile *, word *, int, int);
42 static void text_versionid(textfile *, word *);
43
44 static void text_output(textfile *, const wchar_t *);
45 static void text_output_many(textfile *, int, wchar_t);
46
47 static alignment utoalign(wchar_t *p) {
48 if (!ustricmp(p, L"centre") || !ustricmp(p, L"center"))
49 return CENTRE;
50 if (!ustricmp(p, L"leftplus"))
51 return LEFTPLUS;
52 return LEFT;
53 }
54
55 static textconfig text_configure(paragraph *source) {
56 textconfig ret;
57
58 /*
59 * Non-negotiables.
60 */
61 ret.bullet.next = NULL;
62 ret.bullet.alt = NULL;
63 ret.bullet.type = word_Normal;
64 ret.atitle.just_numbers = FALSE; /* ignored */
65
66 /*
67 * Defaults.
68 */
69 ret.indent = 7;
70 ret.indent_code = 2;
71 ret.listindentbefore = 1;
72 ret.listindentafter = 3;
73 ret.width = 68;
74 ret.atitle.align = CENTRE;
75 ret.atitle.underline = L'=';
76 ret.achapter.align = LEFT;
77 ret.achapter.just_numbers = FALSE;
78 ret.achapter.number_suffix = L": ";
79 ret.achapter.underline = L'-';
80 ret.nasect = 1;
81 ret.asect = mknewa(alignstruct, ret.nasect);
82 ret.asect[0].align = LEFTPLUS;
83 ret.asect[0].just_numbers = TRUE;
84 ret.asect[0].number_suffix = L" ";
85 ret.asect[0].underline = L'\0';
86 ret.include_version_id = TRUE;
87 ret.indent_preambles = FALSE;
88 ret.bullet.text = L"-";
89 ret.filename = dupstr("output.txt");
90 ret.charset = CS_ASCII;
91
92 for (; source; source = source->next) {
93 if (source->type == para_Config) {
94 if (!ustricmp(source->keyword, L"text-indent")) {
95 ret.indent = utoi(uadv(source->keyword));
96 } else if (!ustricmp(source->keyword, L"text-charset")) {
97 char *csname = utoa_dup(uadv(source->keyword), CS_ASCII);
98 ret.charset = charset_from_localenc(csname);
99 sfree(csname);
100 } else if (!ustricmp(source->keyword, L"text-filename")) {
101 sfree(ret.filename);
102 ret.filename = dupstr(adv(source->origkeyword));
103 } else if (!ustricmp(source->keyword, L"text-indent-code")) {
104 ret.indent_code = utoi(uadv(source->keyword));
105 } else if (!ustricmp(source->keyword, L"text-width")) {
106 ret.width = utoi(uadv(source->keyword));
107 } else if (!ustricmp(source->keyword, L"text-list-indent")) {
108 ret.listindentbefore = utoi(uadv(source->keyword));
109 } else if (!ustricmp(source->keyword, L"text-listitem-indent")) {
110 ret.listindentafter = utoi(uadv(source->keyword));
111 } else if (!ustricmp(source->keyword, L"text-chapter-align")) {
112 ret.achapter.align = utoalign(uadv(source->keyword));
113 } else if (!ustricmp(source->keyword, L"text-chapter-underline")) {
114 ret.achapter.underline = *uadv(source->keyword);
115 } else if (!ustricmp(source->keyword, L"text-chapter-numeric")) {
116 ret.achapter.just_numbers = utob(uadv(source->keyword));
117 } else if (!ustricmp(source->keyword, L"text-chapter-suffix")) {
118 ret.achapter.number_suffix = uadv(source->keyword);
119 } else if (!ustricmp(source->keyword, L"text-section-align")) {
120 wchar_t *p = uadv(source->keyword);
121 int n = 0;
122 if (uisdigit(*p)) {
123 n = utoi(p);
124 p = uadv(p);
125 }
126 if (n >= ret.nasect) {
127 int i;
128 ret.asect = resize(ret.asect, n+1);
129 for (i = ret.nasect; i <= n; i++)
130 ret.asect[i] = ret.asect[ret.nasect-1];
131 ret.nasect = n+1;
132 }
133 ret.asect[n].align = utoalign(p);
134 } else if (!ustricmp(source->keyword, L"text-section-underline")) {
135 wchar_t *p = uadv(source->keyword);
136 int n = 0;
137 if (uisdigit(*p)) {
138 n = utoi(p);
139 p = uadv(p);
140 }
141 if (n >= ret.nasect) {
142 int i;
143 ret.asect = resize(ret.asect, n+1);
144 for (i = ret.nasect; i <= n; i++)
145 ret.asect[i] = ret.asect[ret.nasect-1];
146 ret.nasect = n+1;
147 }
148 ret.asect[n].underline = *p;
149 } else if (!ustricmp(source->keyword, L"text-section-numeric")) {
150 wchar_t *p = uadv(source->keyword);
151 int n = 0;
152 if (uisdigit(*p)) {
153 n = utoi(p);
154 p = uadv(p);
155 }
156 if (n >= ret.nasect) {
157 int i;
158 ret.asect = resize(ret.asect, n+1);
159 for (i = ret.nasect; i <= n; i++)
160 ret.asect[i] = ret.asect[ret.nasect-1];
161 ret.nasect = n+1;
162 }
163 ret.asect[n].just_numbers = utob(p);
164 } else if (!ustricmp(source->keyword, L"text-section-suffix")) {
165 wchar_t *p = uadv(source->keyword);
166 int n = 0;
167 if (uisdigit(*p)) {
168 n = utoi(p);
169 p = uadv(p);
170 }
171 if (n >= ret.nasect) {
172 int i;
173 ret.asect = resize(ret.asect, n+1);
174 for (i = ret.nasect; i <= n; i++) {
175 ret.asect[i] = ret.asect[ret.nasect-1];
176 }
177 ret.nasect = n+1;
178 }
179 ret.asect[n].number_suffix = p;
180 } else if (!ustricmp(source->keyword, L"text-title-align")) {
181 ret.atitle.align = utoalign(uadv(source->keyword));
182 } else if (!ustricmp(source->keyword, L"text-title-underline")) {
183 ret.atitle.underline = *uadv(source->keyword);
184 } else if (!ustricmp(source->keyword, L"text-versionid")) {
185 ret.include_version_id = utob(uadv(source->keyword));
186 } else if (!ustricmp(source->keyword, L"text-indent-preamble")) {
187 ret.indent_preambles = utob(uadv(source->keyword));
188 } else if (!ustricmp(source->keyword, L"text-bullet")) {
189 ret.bullet.text = uadv(source->keyword);
190 }
191 }
192 }
193
194 return ret;
195 }
196
197 paragraph *text_config_filename(char *filename)
198 {
199 return cmdline_cfg_simple("text-filename", filename, NULL);
200 }
201
202 void text_backend(paragraph *sourceform, keywordlist *keywords,
203 indexdata *idx, void *unused) {
204 paragraph *p;
205 textconfig conf;
206 word *prefix, *body, *wp;
207 word spaceword;
208 textfile tf;
209 wchar_t *prefixextra;
210 int nesting, nestindent;
211 int indentb, indenta;
212
213 IGNORE(unused);
214 IGNORE(keywords); /* we don't happen to need this */
215 IGNORE(idx); /* or this */
216
217 conf = text_configure(sourceform);
218
219 /*
220 * Open the output file.
221 */
222 tf.fp = fopen(conf.filename, "w");
223 if (!tf.fp) {
224 error(err_cantopenw, conf.filename);
225 return;
226 }
227 tf.charset = conf.charset;
228 tf.state = charset_init_state;
229
230 /* Do the title */
231 for (p = sourceform; p; p = p->next)
232 if (p->type == para_Title)
233 text_heading(&tf, NULL, NULL, p->words,
234 conf.atitle, conf.indent, conf.width);
235
236 nestindent = conf.listindentbefore + conf.listindentafter;
237 nesting = (conf.indent_preambles ? 0 : -conf.indent);
238
239 /* Do the main document */
240 for (p = sourceform; p; p = p->next) switch (p->type) {
241
242 case para_QuotePush:
243 nesting += 2;
244 break;
245 case para_QuotePop:
246 nesting -= 2;
247 assert(nesting >= 0);
248 break;
249
250 case para_LcontPush:
251 nesting += nestindent;
252 break;
253 case para_LcontPop:
254 nesting -= nestindent;
255 assert(nesting >= 0);
256 break;
257
258 /*
259 * Things we ignore because we've already processed them or
260 * aren't going to touch them in this pass.
261 */
262 case para_IM:
263 case para_BR:
264 case para_Biblio: /* only touch BiblioCited */
265 case para_VersionID:
266 case para_NoCite:
267 case para_Title:
268 break;
269
270 /*
271 * Chapter titles.
272 */
273 case para_Chapter:
274 case para_Appendix:
275 case para_UnnumberedChapter:
276 text_heading(&tf, p->kwtext, p->kwtext2, p->words,
277 conf.achapter, conf.indent, conf.width);
278 nesting = 0;
279 break;
280
281 case para_Heading:
282 case para_Subsect:
283 text_heading(&tf, p->kwtext, p->kwtext2, p->words,
284 conf.asect[p->aux>=conf.nasect ? conf.nasect-1 : p->aux],
285 conf.indent, conf.width);
286 break;
287
288 case para_Rule:
289 text_rule(&tf, conf.indent + nesting, conf.width - nesting);
290 break;
291
292 case para_Normal:
293 case para_Copyright:
294 case para_DescribedThing:
295 case para_Description:
296 case para_BiblioCited:
297 case para_Bullet:
298 case para_NumberedList:
299 if (p->type == para_Bullet) {
300 prefix = &conf.bullet;
301 prefixextra = NULL;
302 indentb = conf.listindentbefore;
303 indenta = conf.listindentafter;
304 } else if (p->type == para_NumberedList) {
305 prefix = p->kwtext;
306 prefixextra = L"."; /* FIXME: configurability */
307 indentb = conf.listindentbefore;
308 indenta = conf.listindentafter;
309 } else if (p->type == para_Description) {
310 prefix = NULL;
311 prefixextra = NULL;
312 indentb = conf.listindentbefore;
313 indenta = conf.listindentafter;
314 } else {
315 prefix = NULL;
316 prefixextra = NULL;
317 indentb = indenta = 0;
318 }
319 if (p->type == para_BiblioCited) {
320 body = dup_word_list(p->kwtext);
321 for (wp = body; wp->next; wp = wp->next);
322 wp->next = &spaceword;
323 spaceword.next = p->words;
324 spaceword.alt = NULL;
325 spaceword.type = word_WhiteSpace;
326 spaceword.text = NULL;
327 } else {
328 wp = NULL;
329 body = p->words;
330 }
331 text_para(&tf, prefix, prefixextra, body,
332 conf.indent + nesting + indentb, indenta,
333 conf.width - nesting - indentb - indenta);
334 if (wp) {
335 wp->next = NULL;
336 free_word_list(body);
337 }
338 break;
339
340 case para_Code:
341 text_codepara(&tf, p->words,
342 conf.indent + nesting + conf.indent_code,
343 conf.width - nesting - 2 * conf.indent_code);
344 break;
345 }
346
347 /* Do the version ID */
348 if (conf.include_version_id) {
349 for (p = sourceform; p; p = p->next)
350 if (p->type == para_VersionID)
351 text_versionid(&tf, p->words);
352 }
353
354 /*
355 * Tidy up
356 */
357 text_output(&tf, NULL); /* end charset conversion */
358 fclose(tf.fp);
359 sfree(conf.asect);
360 sfree(conf.filename);
361 }
362
363 static int text_ok(int charset, const wchar_t *s)
364 {
365 char buf[256];
366 charset_state state = CHARSET_INIT_STATE;
367 int err, len = ustrlen(s);
368
369 err = 0;
370 while (len > 0) {
371 (void)charset_from_unicode(&s, &len, buf, lenof(buf),
372 charset, &state, &err);
373 if (err)
374 return FALSE;
375 }
376 return TRUE;
377 }
378
379 static void text_output(textfile *tf, const wchar_t *s)
380 {
381 char buf[256];
382 int ret, len;
383 const wchar_t **sp;
384
385 if (!s) {
386 sp = NULL;
387 len = 1;
388 } else {
389 sp = &s;
390 len = ustrlen(s);
391 }
392
393 while (len > 0) {
394 ret = charset_from_unicode(sp, &len, buf, lenof(buf),
395 tf->charset, &tf->state, NULL);
396 if (!sp)
397 len = 0;
398 fwrite(buf, 1, ret, tf->fp);
399 }
400 }
401
402 static void text_output_many(textfile *tf, int n, wchar_t c)
403 {
404 wchar_t s[2];
405 s[0] = c;
406 s[1] = L'\0';
407 while (n--)
408 text_output(tf, s);
409 }
410
411 static void text_rdaddw(int charset, rdstring *rs, word *text, word *end) {
412 for (; text && text != end; text = text->next) switch (text->type) {
413 case word_HyperLink:
414 case word_HyperEnd:
415 case word_UpperXref:
416 case word_LowerXref:
417 case word_XrefEnd:
418 case word_IndexRef:
419 break;
420
421 case word_Normal:
422 case word_Emph:
423 case word_Code:
424 case word_WeakCode:
425 case word_WhiteSpace:
426 case word_EmphSpace:
427 case word_CodeSpace:
428 case word_WkCodeSpace:
429 case word_Quote:
430 case word_EmphQuote:
431 case word_CodeQuote:
432 case word_WkCodeQuote:
433 assert(text->type != word_CodeQuote &&
434 text->type != word_WkCodeQuote);
435 if (towordstyle(text->type) == word_Emph &&
436 (attraux(text->aux) == attr_First ||
437 attraux(text->aux) == attr_Only))
438 rdadd(rs, L'_'); /* FIXME: configurability */
439 else if (towordstyle(text->type) == word_Code &&
440 (attraux(text->aux) == attr_First ||
441 attraux(text->aux) == attr_Only))
442 rdadd(rs, L'`'); /* FIXME: configurability */
443 if (removeattr(text->type) == word_Normal) {
444 if (text_ok(charset, text->text) || !text->alt)
445 rdadds(rs, text->text);
446 else
447 text_rdaddw(charset, rs, text->alt, NULL);
448 } else if (removeattr(text->type) == word_WhiteSpace) {
449 rdadd(rs, L' ');
450 } else if (removeattr(text->type) == word_Quote) {
451 rdadd(rs, quoteaux(text->aux) == quote_Open ? L'`' : L'\'');
452 /* FIXME: configurability */
453 }
454 if (towordstyle(text->type) == word_Emph &&
455 (attraux(text->aux) == attr_Last ||
456 attraux(text->aux) == attr_Only))
457 rdadd(rs, L'_'); /* FIXME: configurability */
458 else if (towordstyle(text->type) == word_Code &&
459 (attraux(text->aux) == attr_Last ||
460 attraux(text->aux) == attr_Only))
461 rdadd(rs, L'\''); /* FIXME: configurability */
462 break;
463 }
464 }
465
466 static int text_width(void *, word *);
467
468 static int text_width_list(void *ctx, word *text) {
469 int w = 0;
470 while (text) {
471 w += text_width(ctx, text);
472 text = text->next;
473 }
474 return w;
475 }
476
477 static int text_width(void *ctx, word *text) {
478 int charset = * (int *) ctx;
479
480 switch (text->type) {
481 case word_HyperLink:
482 case word_HyperEnd:
483 case word_UpperXref:
484 case word_LowerXref:
485 case word_XrefEnd:
486 case word_IndexRef:
487 return 0;
488
489 case word_Normal:
490 case word_Emph:
491 case word_Code:
492 case word_WeakCode:
493 return (((text->type == word_Emph ||
494 text->type == word_Code)
495 ? (attraux(text->aux) == attr_Only ? 2 :
496 attraux(text->aux) == attr_Always ? 0 : 1)
497 : 0) +
498 (text_ok(charset, text->text) || !text->alt ?
499 ustrlen(text->text) :
500 text_width_list(ctx, text->alt)));
501
502 case word_WhiteSpace:
503 case word_EmphSpace:
504 case word_CodeSpace:
505 case word_WkCodeSpace:
506 case word_Quote:
507 case word_EmphQuote:
508 case word_CodeQuote:
509 case word_WkCodeQuote:
510 assert(text->type != word_CodeQuote &&
511 text->type != word_WkCodeQuote);
512 return (((towordstyle(text->type) == word_Emph ||
513 towordstyle(text->type) == word_Code)
514 ? (attraux(text->aux) == attr_Only ? 2 :
515 attraux(text->aux) == attr_Always ? 0 : 1)
516 : 0) + 1);
517 }
518 return 0; /* should never happen */
519 }
520
521 static void text_heading(textfile *tf, word *tprefix, word *nprefix,
522 word *text, alignstruct align,
523 int indent, int width) {
524 rdstring t = { 0, 0, NULL };
525 int margin, length;
526 int firstlinewidth, wrapwidth;
527 wrappedline *wrapping, *p;
528
529 if (align.just_numbers && nprefix) {
530 text_rdaddw(tf->charset, &t, nprefix, NULL);
531 rdadds(&t, align.number_suffix);
532 } else if (!align.just_numbers && tprefix) {
533 text_rdaddw(tf->charset, &t, tprefix, NULL);
534 rdadds(&t, align.number_suffix);
535 }
536 margin = length = t.pos;
537
538 if (align.align == LEFTPLUS) {
539 margin = indent - margin;
540 if (margin < 0) margin = 0;
541 firstlinewidth = indent + width - margin - length;
542 wrapwidth = width;
543 } else if (align.align == LEFT || align.align == CENTRE) {
544 margin = 0;
545 firstlinewidth = indent + width - length;
546 wrapwidth = indent + width;
547 }
548
549 wrapping = wrap_para(text, firstlinewidth, wrapwidth,
550 text_width, &tf->charset, 0);
551 for (p = wrapping; p; p = p->next) {
552 text_rdaddw(tf->charset, &t, p->begin, p->end);
553 length = t.pos;
554 if (align.align == CENTRE) {
555 margin = (indent + width - length)/2;
556 if (margin < 0) margin = 0;
557 }
558 text_output_many(tf, margin, L' ');
559 text_output(tf, t.text);
560 text_output(tf, L"\n");
561 if (align.underline != L'\0') {
562 text_output_many(tf, margin, L' ');
563 text_output_many(tf, length, align.underline);
564 text_output(tf, L"\n");
565 }
566 if (align.align == LEFTPLUS)
567 margin = indent;
568 else
569 margin = 0;
570 sfree(t.text);
571 t = empty_rdstring;
572 }
573 wrap_free(wrapping);
574 text_output(tf, L"\n");
575
576 sfree(t.text);
577 }
578
579 static void text_rule(textfile *tf, int indent, int width) {
580 text_output_many(tf, indent, L' ');
581 text_output_many(tf, width, L'-'); /* FIXME: configurability! */
582 text_output_many(tf, 2, L'\n');
583 }
584
585 static void text_para(textfile *tf, word *prefix, wchar_t *prefixextra,
586 word *text, int indent, int extraindent, int width) {
587 wrappedline *wrapping, *p;
588 rdstring pfx = { 0, 0, NULL };
589 int e;
590 int firstlinewidth = width;
591
592 if (prefix) {
593 text_rdaddw(tf->charset, &pfx, prefix, NULL);
594 if (prefixextra)
595 rdadds(&pfx, prefixextra);
596 text_output_many(tf, indent, L' ');
597 text_output(tf, pfx.text);
598 /* If the prefix is too long, shorten the first line to fit. */
599 e = extraindent - pfx.pos;
600 if (e < 0) {
601 firstlinewidth += e; /* this decreases it, since e < 0 */
602 if (firstlinewidth < 0) {
603 e = indent + extraindent;
604 firstlinewidth = width;
605 text_output(tf, L"\n");
606 } else
607 e = 0;
608 }
609 sfree(pfx.text);
610 } else
611 e = indent + extraindent;
612
613 wrapping = wrap_para(text, firstlinewidth, width,
614 text_width, &tf->charset, 0);
615 for (p = wrapping; p; p = p->next) {
616 rdstring t = { 0, 0, NULL };
617 text_rdaddw(tf->charset, &t, p->begin, p->end);
618 text_output_many(tf, e, L' ');
619 text_output(tf, t.text);
620 text_output(tf, L"\n");
621 e = indent + extraindent;
622 sfree(t.text);
623 }
624 wrap_free(wrapping);
625 text_output(tf, L"\n");
626 }
627
628 static void text_codepara(textfile *tf, word *text, int indent, int width) {
629 for (; text; text = text->next) if (text->type == word_WeakCode) {
630 if (ustrlen(text->text) > width) {
631 /* FIXME: warn */
632 }
633 text_output_many(tf, indent, L' ');
634 text_output(tf, text->text);
635 text_output(tf, L"\n");
636 }
637
638 text_output(tf, L"\n");
639 }
640
641 static void text_versionid(textfile *tf, word *text) {
642 rdstring t = { 0, 0, NULL };
643
644 rdadd(&t, L'['); /* FIXME: configurability */
645 text_rdaddw(tf->charset, &t, text, NULL);
646 rdadd(&t, L']'); /* FIXME: configurability */
647 rdadd(&t, L'\n');
648
649 text_output(tf, t.text);
650 sfree(t.text);
651 }