Infrastructure changes for character set support. ustrtoa,
[sgt/halibut] / bk_pdf.c
1 /*
2 * PDF backend for Halibut
3 */
4
5 #include <assert.h>
6 #include "halibut.h"
7 #include "paper.h"
8
9 #define TREE_BRANCH 2 /* max branching factor in page tree */
10
11 paragraph *pdf_config_filename(char *filename)
12 {
13 return cmdline_cfg_simple("pdf-filename", filename, NULL);
14 }
15
16 typedef struct object_Tag object;
17 typedef struct objlist_Tag objlist;
18
19 struct object_Tag {
20 objlist *list;
21 object *next;
22 int number;
23 rdstringc main, stream;
24 int size, fileoff;
25 char *final;
26 };
27
28 struct objlist_Tag {
29 int number;
30 object *head, *tail;
31 };
32
33 static object *new_object(objlist *list);
34 static void objtext(object *o, char const *text);
35 static void objstream(object *o, char const *text);
36 static void pdf_string(void (*add)(object *, char const *),
37 object *, char const *);
38 static void objref(object *o, object *dest);
39
40 static void make_pages_node(object *node, object *parent, page_data *first,
41 page_data *last, object *resources);
42 static int make_outline(object *parent, outline_element *start, int n,
43 int open);
44 static int pdf_versionid(FILE *fp, word *words);
45
46 void pdf_backend(paragraph *sourceform, keywordlist *keywords,
47 indexdata *idx, void *vdoc) {
48 document *doc = (document *)vdoc;
49 int font_index;
50 font_encoding *fe;
51 page_data *page;
52 int pageno;
53 FILE *fp;
54 char *filename;
55 paragraph *p;
56 objlist olist;
57 object *o, *cat, *outlines, *pages, *resources;
58 int fileoff;
59
60 IGNORE(keywords);
61 IGNORE(idx);
62
63 filename = dupstr("output.pdf");
64 for (p = sourceform; p; p = p->next) {
65 if (p->type == para_Config && p->parent) {
66 if (!ustricmp(p->keyword, L"pdf-filename")) {
67 sfree(filename);
68 filename = dupstr(adv(p->origkeyword));
69 }
70 }
71 }
72
73 olist.head = olist.tail = NULL;
74 olist.number = 1;
75
76 cat = new_object(&olist);
77 outlines = new_object(&olist);
78 pages = new_object(&olist);
79 resources = new_object(&olist);
80
81 /*
82 * The catalogue just contains references to the outlines and
83 * pages objects.
84 */
85 objtext(cat, "<<\n/Type /Catalog\n/Outlines ");
86 objref(cat, outlines);
87 objtext(cat, "\n/Pages ");
88 objref(cat, pages);
89 objtext(cat, "\n/PageMode /UseOutlines\n>>\n");
90
91 /*
92 * Set up the resources dictionary, which mostly means
93 * providing all the font objects and names to call them by.
94 */
95 font_index = 0;
96 objtext(resources, "<<\n/Font <<\n");
97 for (fe = doc->fonts->head; fe; fe = fe->next) {
98 char fname[40];
99 int i;
100 object *font;
101
102 sprintf(fname, "f%d", font_index++);
103 fe->name = dupstr(fname);
104
105 font = new_object(&olist);
106
107 objtext(resources, "/");
108 objtext(resources, fe->name);
109 objtext(resources, " ");
110 objref(resources, font);
111 objtext(resources, "\n");
112
113 objtext(font, "<<\n/Type /Font\n/Subtype /Type1\n/Name /");
114 objtext(font, fe->name);
115 objtext(font, "\n/BaseFont /");
116 objtext(font, fe->font->name);
117 objtext(font, "\n/Encoding <<\n/Type /Encoding\n/Differences [");
118
119 for (i = 0; i < 256; i++) {
120 char buf[20];
121 if (!fe->vector[i])
122 continue;
123 sprintf(buf, "\n%d /", i);
124 objtext(font, buf);
125 objtext(font, fe->vector[i] ? fe->vector[i] : ".notdef");
126 }
127
128 objtext(font, "\n]\n>>\n");
129
130 {
131 object *widths = new_object(&olist);
132 objtext(font, "/FirstChar 0\n/LastChar 255\n/Widths ");
133 objref(font, widths);
134 objtext(font, "\n");
135 objtext(widths, "[\n");
136 for (i = 0; i < 256; i++) {
137 char buf[80];
138 double width;
139 if (fe->indices[i] < 0)
140 width = 0.0;
141 else
142 width = fe->font->widths[fe->indices[i]];
143 sprintf(buf, "%g\n", 1000.0 * width / 4096.0);
144 objtext(widths, buf);
145 }
146 objtext(widths, "]\n");
147 }
148
149 objtext(font, ">>\n");
150 }
151 objtext(resources, ">>\n>>\n");
152
153 /*
154 * Define the page objects for each page, and get each one
155 * ready to have a `Parent' specification added to it.
156 */
157 for (page = doc->pages; page; page = page->next) {
158 object *opage;
159
160 opage = new_object(&olist);
161 page->spare = opage;
162 objtext(opage, "<<\n/Type /Page\n");
163 }
164
165 /*
166 * Recursively build the page tree.
167 */
168 make_pages_node(pages, NULL, doc->pages, NULL, resources);
169
170 /*
171 * Create and render the individual pages.
172 */
173 pageno = 0;
174 for (page = doc->pages; page; page = page->next) {
175 object *opage, *cstr;
176 rect *r;
177 text_fragment *frag, *frag_end;
178 char buf[256];
179 int x, y, lx, ly;
180
181 opage = (object *)page->spare;
182 /*
183 * At this point the page dictionary is already
184 * half-written, with /Type and /Parent already present. We
185 * continue from there.
186 */
187
188 /*
189 * The PDF spec says /Resources is required, but also says
190 * that it's inheritable and may be omitted if it's present
191 * in a Pages node. In our case it is: it's present in the
192 * topmost /Pages node because we carefully put it there.
193 * So we don't need a /Resources entry here.
194 */
195 sprintf(buf, "/MediaBox [0 0 %g %g]\n",
196 doc->paper_width / 4096.0, doc->paper_height / 4096.0);
197 objtext(opage, buf);
198
199 /*
200 * Now we're ready to define a content stream containing
201 * the actual text on the page.
202 */
203 cstr = new_object(&olist);
204 objtext(opage, "/Contents ");
205 objref(opage, cstr);
206 objtext(opage, "\n");
207
208 /*
209 * Render any rectangles on the page.
210 */
211 for (r = page->first_rect; r; r = r->next) {
212 char buf[512];
213 sprintf(buf, "%g %g %g %g re f\n", r->x / 4096.0,
214 r->y / 4096.0, r->w / 4096.0, r->h / 4096.0);
215 objstream(cstr, buf);
216 }
217
218 objstream(cstr, "BT\n");
219
220 /*
221 * PDF tracks two separate current positions: the position
222 * given in the `line matrix' and the position given in the
223 * `text matrix'. We must therefore track both as well.
224 * They start off at -1 (unset).
225 */
226 lx = ly = -1;
227 x = y = -1;
228
229 frag = page->first_text;
230 while (frag) {
231 /*
232 * For compactness, I'm going to group text fragments
233 * into subsequences that use the same font+size. So
234 * first find the end of this subsequence.
235 */
236 for (frag_end = frag;
237 (frag_end &&
238 frag_end->fe == frag->fe &&
239 frag_end->fontsize == frag->fontsize);
240 frag_end = frag_end->next);
241
242 /*
243 * Now select the text fragment, and prepare to display
244 * the text.
245 */
246 objstream(cstr, "/");
247 objstream(cstr, frag->fe->name);
248 sprintf(buf, " %d Tf ", frag->fontsize);
249 objstream(cstr, buf);
250
251 while (frag && frag != frag_end) {
252 /*
253 * Place the text position for the first piece of
254 * text.
255 */
256 if (lx < 0) {
257 sprintf(buf, "1 0 0 1 %g %g Tm ",
258 frag->x/4096.0, frag->y/4096.0);
259 } else {
260 sprintf(buf, "%g %g Td ",
261 (frag->x - lx)/4096.0, (frag->y - ly)/4096.0);
262 }
263 objstream(cstr, buf);
264 lx = x = frag->x;
265 ly = y = frag->y;
266
267 /*
268 * See if we're going to use Tj (show a single
269 * string) or TJ (show an array of strings with
270 * x-spacings between them). We determine this by
271 * seeing if there's more than one text fragment in
272 * sequence with the same y-coordinate.
273 */
274 if (frag->next && frag->next != frag_end &&
275 frag->next->y == y) {
276 /*
277 * The TJ strategy.
278 */
279 objstream(cstr, "[");
280 while (frag && frag != frag_end && frag->y == y) {
281 if (frag->x != x) {
282 sprintf(buf, "%g",
283 (x - frag->x) * 1000.0 /
284 (4096.0 * frag->fontsize));
285 objstream(cstr, buf);
286 }
287 pdf_string(objstream, cstr, frag->text);
288 x = frag->x + frag->width;
289 frag = frag->next;
290 }
291 objstream(cstr, "]TJ\n");
292 } else
293 {
294 /*
295 * The Tj strategy.
296 */
297 pdf_string(objstream, cstr, frag->text);
298 objstream(cstr, "Tj\n");
299 frag = frag->next;
300 }
301 }
302 }
303 objstream(cstr, "ET");
304
305 /*
306 * Also, we want an annotation dictionary containing the
307 * cross-references from this page.
308 */
309 if (page->first_xref) {
310 xref *xr;
311 objtext(opage, "/Annots [\n");
312
313 for (xr = page->first_xref; xr; xr = xr->next) {
314 object *annot;
315 char buf[256];
316
317 annot = new_object(&olist);
318 objref(opage, annot);
319 objtext(opage, "\n");
320
321 objtext(annot, "<<\n/Type /Annot\n/Subtype /Link\n/Rect [");
322 sprintf(buf, "%g %g %g %g",
323 xr->lx / 4096.0, xr->by / 4096.0,
324 xr->rx / 4096.0, xr->ty / 4096.0);
325 objtext(annot, buf);
326 objtext(annot, "]\n/Border [0 0 0]\n");
327
328 if (xr->dest.type == PAGE) {
329 objtext(annot, "/Dest [");
330 objref(annot, (object *)xr->dest.page->spare);
331 objtext(annot, " /XYZ null null null]\n");
332 } else {
333 objtext(annot, "/A <<\n/Type /Action\n/S /URI\n/URI ");
334 pdf_string(objtext, annot, xr->dest.url);
335 objtext(annot, "\n>>\n");
336 }
337
338 objtext(annot, ">>\n");
339 }
340
341 objtext(opage, "]\n");
342 }
343
344 objtext(opage, ">>\n");
345 }
346
347 /*
348 * Set up the outlines dictionary.
349 */
350 {
351 int topcount;
352 char buf[80];
353
354 objtext(outlines, "<<\n/Type /Outlines\n");
355 topcount = make_outline(outlines, doc->outline_elements,
356 doc->n_outline_elements, TRUE);
357 sprintf(buf, "/Count %d\n>>\n", topcount);
358 objtext(outlines, buf);
359 }
360
361 /*
362 * Assemble the final linear form of every object.
363 */
364 for (o = olist.head; o; o = o->next) {
365 rdstringc rs = {0, 0, NULL};
366 char text[80];
367
368 sprintf(text, "%d 0 obj\n", o->number);
369 rdaddsc(&rs, text);
370
371 if (!o->main.text && o->stream.text) {
372 sprintf(text, "<<\n/Length %d\n>>\n", o->stream.pos);
373 rdaddsc(&o->main, text);
374 }
375
376 assert(o->main.text);
377 rdaddsc(&rs, o->main.text);
378 sfree(o->main.text);
379
380 if (rs.text[rs.pos-1] != '\n')
381 rdaddc(&rs, '\n');
382
383 if (o->stream.text) {
384 /*
385 * FIXME: If we ever start compressing stream data then
386 * it will have zero bytes in it, so we'll have to be
387 * more careful than this.
388 */
389 rdaddsc(&rs, "stream\n");
390 rdaddsc(&rs, o->stream.text);
391 rdaddsc(&rs, "\nendstream\n");
392 sfree(o->stream.text);
393 }
394
395 rdaddsc(&rs, "endobj\n");
396
397 o->final = rs.text;
398 o->size = rs.pos;
399 }
400
401 /*
402 * Write out the PDF file.
403 */
404
405 fp = fopen(filename, "wb");
406 if (!fp) {
407 error(err_cantopenw, filename);
408 return;
409 }
410
411 /*
412 * Header. I'm going to put the version IDs in the header as
413 * well, simply in PDF comments.
414 */
415 fileoff = fprintf(fp, "%%PDF-1.3\n");
416 for (p = sourceform; p; p = p->next)
417 if (p->type == para_VersionID)
418 fileoff += pdf_versionid(fp, p->words);
419
420 /*
421 * Body
422 */
423 for (o = olist.head; o; o = o->next) {
424 o->fileoff = fileoff;
425 fwrite(o->final, 1, o->size, fp);
426 fileoff += o->size;
427 }
428
429 /*
430 * Cross-reference table
431 */
432 fprintf(fp, "xref\n");
433 assert(olist.head->number == 1);
434 fprintf(fp, "0 %d\n", olist.tail->number + 1);
435 fprintf(fp, "0000000000 65535 f \n");
436 for (o = olist.head; o; o = o->next) {
437 char entry[40];
438 sprintf(entry, "%010d 00000 n \n", o->fileoff);
439 assert(strlen(entry) == 20);
440 fputs(entry, fp);
441 }
442
443 /*
444 * Trailer
445 */
446 fprintf(fp, "trailer\n<<\n/Size %d\n/Root %d 0 R\n>>\n",
447 olist.tail->number + 1, cat->number);
448 fprintf(fp, "startxref\n%d\n%%%%EOF\n", fileoff);
449
450 fclose(fp);
451
452 sfree(filename);
453 }
454
455 static object *new_object(objlist *list)
456 {
457 object *obj = mknew(object);
458
459 obj->list = list;
460
461 obj->main.text = NULL;
462 obj->main.pos = obj->main.size = 0;
463 obj->stream.text = NULL;
464 obj->stream.pos = obj->stream.size = 0;
465
466 obj->number = list->number++;
467
468 obj->next = NULL;
469 if (list->tail)
470 list->tail->next = obj;
471 else
472 list->head = obj;
473 list->tail = obj;
474
475 obj->size = 0;
476 obj->final = NULL;
477
478 return obj;
479 }
480
481 static void objtext(object *o, char const *text)
482 {
483 rdaddsc(&o->main, text);
484 }
485
486 static void objstream(object *o, char const *text)
487 {
488 rdaddsc(&o->stream, text);
489 }
490
491 static void objref(object *o, object *dest)
492 {
493 char buf[40];
494 sprintf(buf, "%d 0 R", dest->number);
495 rdaddsc(&o->main, buf);
496 }
497
498 static void make_pages_node(object *node, object *parent, page_data *first,
499 page_data *last, object *resources)
500 {
501 int count;
502 page_data *page;
503 char buf[80];
504
505 objtext(node, "<<\n/Type /Pages\n");
506 if (parent) {
507 objtext(node, "/Parent ");
508 objref(node, parent);
509 objtext(node, "\n");
510 }
511
512 /*
513 * Count the pages in this stretch, to see if there are few
514 * enough to reference directly.
515 */
516 count = 0;
517 for (page = first; page; page = page->next) {
518 count++;
519 if (page == last)
520 break;
521 }
522
523 sprintf(buf, "/Count %d\n/Kids [\n", count);
524 objtext(node, buf);
525
526 if (count > TREE_BRANCH) {
527 int i;
528 page_data *thisfirst, *thislast;
529
530 page = first;
531
532 for (i = 0; i < TREE_BRANCH; i++) {
533 int number = (i+1) * count / TREE_BRANCH - i * count / TREE_BRANCH;
534 thisfirst = page;
535 while (number--) {
536 thislast = page;
537 page = page->next;
538 }
539
540 if (thisfirst == thislast) {
541 objref(node, (object *)thisfirst->spare);
542 objtext((object *)thisfirst->spare, "/Parent ");
543 objref((object *)thisfirst->spare, node);
544 objtext((object *)thisfirst->spare, "\n");
545 } else {
546 object *newnode = new_object(node->list);
547 make_pages_node(newnode, node, thisfirst, thislast, NULL);
548 objref(node, newnode);
549 }
550 objtext(node, "\n");
551 }
552
553 assert(thislast == last || page == NULL);
554
555 } else {
556 for (page = first; page; page = page->next) {
557 objref(node, (object *)page->spare);
558 objtext(node, "\n");
559 objtext((object *)page->spare, "/Parent ");
560 objref((object *)page->spare, node);
561 objtext((object *)page->spare, "\n");
562 if (page == last)
563 break;
564 }
565 }
566
567 objtext(node, "]\n");
568
569 if (resources) {
570 objtext(node, "/Resources ");
571 objref(node, resources);
572 objtext(node, "\n");
573 }
574
575 objtext(node, ">>\n");
576 }
577
578 /*
579 * In text on the page, PDF uses the PostScript font model, which
580 * means that glyphs are identified by PS strings and hence font
581 * encoding can be managed independently of the supplied encoding
582 * of the font. However, in the document outline, the PDF spec
583 * simply asks for ordinary text strings without mentioning what
584 * character set they are supposed to be interpreted in.
585 *
586 * Therefore, for the moment, I'm going to assume they're US-ASCII
587 * only. If anyone knows better, they should let me know :-/
588 */
589 static int pdf_convert(wchar_t *s, char **result) {
590 int doing = (result != 0);
591 int ok = TRUE;
592 char *p = NULL;
593 int plen = 0, psize = 0;
594
595 for (; *s; s++) {
596 wchar_t c = *s;
597 char outc;
598
599 if (c >= 32 && c <= 126) {
600 /* Char is OK. */
601 outc = (char)c;
602 } else {
603 /* Char is not OK. */
604 ok = FALSE;
605 outc = 0xBF; /* approximate the good old DEC `uh?' */
606 }
607 if (doing) {
608 if (plen >= psize) {
609 psize = plen + 256;
610 p = resize(p, psize);
611 }
612 p[plen++] = outc;
613 }
614 }
615 if (doing) {
616 p = resize(p, plen+1);
617 p[plen] = '\0';
618 *result = p;
619 }
620 return ok;
621 }
622
623 static int make_outline(object *parent, outline_element *items, int n,
624 int open)
625 {
626 int level, totalcount = 0;
627 outline_element *itemp;
628 object *curr, *prev = NULL, *first = NULL, *last = NULL;
629
630 assert(n > 0);
631
632 level = items->level;
633
634 while (n > 0) {
635 char *title;
636
637 /*
638 * Here we expect to be sitting on an item at the given
639 * level. So we start by constructing an outline entry for
640 * that item.
641 */
642 assert(items->level == level);
643
644 pdf_convert(items->pdata->outline_title, &title);
645
646 totalcount++;
647 curr = new_object(parent->list);
648 if (!first) first = curr;
649 last = curr;
650 objtext(curr, "<<\n/Title ");
651 pdf_string(objtext, curr, title);
652 objtext(curr, "\n/Parent ");
653 objref(curr, parent);
654 objtext(curr, "\n/Dest [");
655 objref(curr, (object *)items->pdata->first->page->spare);
656 objtext(curr, " /XYZ null null null]\n");
657 if (prev) {
658 objtext(curr, "/Prev ");
659 objref(curr, prev);
660 objtext(curr, "\n");
661
662 objtext(prev, "/Next ");
663 objref(prev, curr);
664 objtext(prev, "\n>>\n");
665 }
666 prev = curr;
667
668 items++, n--;
669 for (itemp = items; itemp < items+n && itemp->level > level;
670 itemp++);
671
672 if (itemp > items) {
673 char buf[80];
674 int count = make_outline(curr, items, itemp - items, FALSE);
675 if (!open)
676 count = -count;
677 else
678 totalcount += count;
679 sprintf(buf, "/Count %d\n", count);
680 objtext(curr, buf);
681 }
682
683 n -= itemp - items;
684 items = itemp;
685 }
686 objtext(prev, ">>\n");
687
688 assert(first && last);
689 objtext(parent, "/First ");
690 objref(parent, first);
691 objtext(parent, "\n/Last ");
692 objref(parent, last);
693 objtext(parent, "\n");
694
695 return totalcount;
696 }
697
698 static int pdf_versionid(FILE *fp, word *words)
699 {
700 int ret;
701
702 ret = fprintf(fp, "%% ");
703
704 for (; words; words = words->next) {
705 char *text;
706 int type;
707
708 switch (words->type) {
709 case word_HyperLink:
710 case word_HyperEnd:
711 case word_UpperXref:
712 case word_LowerXref:
713 case word_XrefEnd:
714 case word_IndexRef:
715 continue;
716 }
717
718 type = removeattr(words->type);
719
720 switch (type) {
721 case word_Normal:
722 text = utoa_dup(words->text, CS_ASCII);
723 break;
724 case word_WhiteSpace:
725 text = dupstr(" ");
726 break;
727 case word_Quote:
728 text = dupstr("'");
729 break;
730 }
731
732 fputs(text, fp);
733 ret += strlen(text);
734 sfree(text);
735 }
736
737 ret += fprintf(fp, "\n");
738
739 return ret;
740 }
741
742 static void pdf_string(void (*add)(object *, char const *),
743 object *o, char const *str)
744 {
745 char const *p;
746
747 add(o, "(");
748 for (p = str; *p; p++) {
749 char c[2];
750 if (*p == '\\' || *p == '(' || *p == ')')
751 add(o, "\\");
752 c[0] = *p;
753 c[1] = '\0';
754 add(o, c);
755 }
756 add(o, ")");
757 }