2 * PDF backend for Halibut
9 #define TREE_BRANCH 2 /* max branching factor in page tree */
11 paragraph
*pdf_config_filename(char *filename
)
14 wchar_t *ufilename
, *up
;
18 memset(p
, 0, sizeof(*p
));
19 p
->type
= para_Config
;
21 p
->fpos
.filename
= "<command line>";
22 p
->fpos
.line
= p
->fpos
.col
= -1;
24 ufilename
= ufroma_dup(filename
);
25 len
= ustrlen(ufilename
) + 2 + lenof(L
"pdf-filename");
26 p
->keyword
= mknewa(wchar_t, len
);
28 ustrcpy(up
, L
"pdf-filename");
30 ustrcpy(up
, ufilename
);
33 assert(up
- p
->keyword
< len
);
39 typedef struct object_Tag object
;
40 typedef struct objlist_Tag objlist
;
46 rdstringc main
, stream
;
56 static object
*new_object(objlist
*list
);
57 static void objtext(object
*o
, char const *text
);
58 static void objstream(object
*o
, char const *text
);
59 static void pdf_string(void (*add
)(object
*, char const *),
60 object
*, char const *);
61 static void objref(object
*o
, object
*dest
);
63 static void make_pages_node(object
*node
, object
*parent
, page_data
*first
,
64 page_data
*last
, object
*resources
);
65 static int make_outline(object
*parent
, outline_element
*start
, int n
,
67 static int pdf_versionid(FILE *fp
, word
*words
);
69 void pdf_backend(paragraph
*sourceform
, keywordlist
*keywords
,
70 indexdata
*idx
, void *vdoc
) {
71 document
*doc
= (document
*)vdoc
;
80 object
*o
, *cat
, *outlines
, *pages
, *resources
;
86 filename
= dupstr("output.pdf");
87 for (p
= sourceform
; p
; p
= p
->next
) {
88 if (p
->type
== para_Config
&& p
->parent
) {
89 if (!ustricmp(p
->keyword
, L
"pdf-filename")) {
91 filename
= utoa_dup(uadv(p
->keyword
));
96 olist
.head
= olist
.tail
= NULL
;
99 cat
= new_object(&olist
);
100 outlines
= new_object(&olist
);
101 pages
= new_object(&olist
);
102 resources
= new_object(&olist
);
105 * The catalogue just contains references to the outlines and
108 objtext(cat
, "<<\n/Type /Catalog\n/Outlines ");
109 objref(cat
, outlines
);
110 objtext(cat
, "\n/Pages ");
112 objtext(cat
, "\n/PageMode /UseOutlines\n>>\n");
115 * Set up the resources dictionary, which mostly means
116 * providing all the font objects and names to call them by.
119 objtext(resources
, "<<\n/Font <<\n");
120 for (fe
= doc
->fonts
->head
; fe
; fe
= fe
->next
) {
125 sprintf(fname
, "f%d", font_index
++);
126 fe
->name
= dupstr(fname
);
128 font
= new_object(&olist
);
130 objtext(resources
, "/");
131 objtext(resources
, fe
->name
);
132 objtext(resources
, " ");
133 objref(resources
, font
);
134 objtext(resources
, "\n");
136 objtext(font
, "<<\n/Type /Font\n/Subtype /Type1\n/Name /");
137 objtext(font
, fe
->name
);
138 objtext(font
, "\n/BaseFont /");
139 objtext(font
, fe
->font
->name
);
140 objtext(font
, "\n/Encoding <<\n/Type /Encoding\n/Differences [");
142 for (i
= 0; i
< 256; i
++) {
146 sprintf(buf
, "\n%d /", i
);
148 objtext(font
, fe
->vector
[i
] ? fe
->vector
[i
] : ".notdef");
151 objtext(font
, "\n]\n>>\n");
154 object
*widths
= new_object(&olist
);
155 objtext(font
, "/FirstChar 0\n/LastChar 255\n/Widths ");
156 objref(font
, widths
);
158 objtext(widths
, "[\n");
159 for (i
= 0; i
< 256; i
++) {
162 if (fe
->indices
[i
] < 0)
165 width
= fe
->font
->widths
[fe
->indices
[i
]];
166 sprintf(buf
, "%g\n", 1000.0 * width
/ 4096.0);
167 objtext(widths
, buf
);
169 objtext(widths
, "]\n");
172 objtext(font
, ">>\n");
174 objtext(resources
, ">>\n>>\n");
177 * Define the page objects for each page, and get each one
178 * ready to have a `Parent' specification added to it.
180 for (page
= doc
->pages
; page
; page
= page
->next
) {
183 opage
= new_object(&olist
);
185 objtext(opage
, "<<\n/Type /Page\n");
189 * Recursively build the page tree.
191 make_pages_node(pages
, NULL
, doc
->pages
, NULL
, resources
);
194 * Create and render the individual pages.
197 for (page
= doc
->pages
; page
; page
= page
->next
) {
198 object
*opage
, *cstr
;
200 text_fragment
*frag
, *frag_end
;
204 opage
= (object
*)page
->spare
;
206 * At this point the page dictionary is already
207 * half-written, with /Type and /Parent already present. We
208 * continue from there.
212 * The PDF spec says /Resources is required, but also says
213 * that it's inheritable and may be omitted if it's present
214 * in a Pages node. In our case it is: it's present in the
215 * topmost /Pages node because we carefully put it there.
216 * So we don't need a /Resources entry here.
218 sprintf(buf
, "/MediaBox [0 0 %g %g]\n",
219 doc
->paper_width
/ 4096.0, doc
->paper_height
/ 4096.0);
223 * Now we're ready to define a content stream containing
224 * the actual text on the page.
226 cstr
= new_object(&olist
);
227 objtext(opage
, "/Contents ");
229 objtext(opage
, "\n");
232 * Render any rectangles on the page.
234 for (r
= page
->first_rect
; r
; r
= r
->next
) {
236 sprintf(buf
, "%g %g %g %g re f\n", r
->x
/ 4096.0,
237 r
->y
/ 4096.0, r
->w
/ 4096.0, r
->h
/ 4096.0);
238 objstream(cstr
, buf
);
241 objstream(cstr
, "BT\n");
244 * PDF tracks two separate current positions: the position
245 * given in the `line matrix' and the position given in the
246 * `text matrix'. We must therefore track both as well.
247 * They start off at -1 (unset).
252 frag
= page
->first_text
;
255 * For compactness, I'm going to group text fragments
256 * into subsequences that use the same font+size. So
257 * first find the end of this subsequence.
259 for (frag_end
= frag
;
261 frag_end
->fe
== frag
->fe
&&
262 frag_end
->fontsize
== frag
->fontsize
);
263 frag_end
= frag_end
->next
);
266 * Now select the text fragment, and prepare to display
269 objstream(cstr
, "/");
270 objstream(cstr
, frag
->fe
->name
);
271 sprintf(buf
, " %d Tf ", frag
->fontsize
);
272 objstream(cstr
, buf
);
274 while (frag
&& frag
!= frag_end
) {
276 * Place the text position for the first piece of
280 sprintf(buf
, "1 0 0 1 %g %g Tm ",
281 frag
->x
/4096.0, frag
->y
/4096.0);
283 sprintf(buf
, "%g %g Td ",
284 (frag
->x
- lx
)/4096.0, (frag
->y
- ly
)/4096.0);
286 objstream(cstr
, buf
);
291 * See if we're going to use Tj (show a single
292 * string) or TJ (show an array of strings with
293 * x-spacings between them). We determine this by
294 * seeing if there's more than one text fragment in
295 * sequence with the same y-coordinate.
297 if (frag
->next
&& frag
->next
!= frag_end
&&
298 frag
->next
->y
== y
) {
302 objstream(cstr
, "[");
303 while (frag
&& frag
!= frag_end
&& frag
->y
== y
) {
306 (x
- frag
->x
) * 1000.0 /
307 (4096.0 * frag
->fontsize
));
308 objstream(cstr
, buf
);
310 pdf_string(objstream
, cstr
, frag
->text
);
311 x
= frag
->x
+ frag
->width
;
314 objstream(cstr
, "]TJ\n");
320 pdf_string(objstream
, cstr
, frag
->text
);
321 objstream(cstr
, "Tj\n");
326 objstream(cstr
, "ET");
329 * Also, we want an annotation dictionary containing the
330 * cross-references from this page.
332 if (page
->first_xref
) {
334 objtext(opage
, "/Annots [\n");
336 for (xr
= page
->first_xref
; xr
; xr
= xr
->next
) {
340 annot
= new_object(&olist
);
341 objref(opage
, annot
);
342 objtext(opage
, "\n");
344 objtext(annot
, "<<\n/Type /Annot\n/Subtype /Link\n/Rect [");
345 sprintf(buf
, "%g %g %g %g",
346 xr
->lx
/ 4096.0, xr
->by
/ 4096.0,
347 xr
->rx
/ 4096.0, xr
->ty
/ 4096.0);
349 objtext(annot
, "]\n/Border [0 0 0]\n");
351 if (xr
->dest
.type
== PAGE
) {
352 objtext(annot
, "/Dest [");
353 objref(annot
, (object
*)xr
->dest
.page
->spare
);
354 objtext(annot
, " /XYZ null null null]\n");
356 objtext(annot
, "/A <<\n/Type /Action\n/S /URI\n/URI ");
357 pdf_string(objtext
, annot
, xr
->dest
.url
);
358 objtext(annot
, "\n>>\n");
361 objtext(annot
, ">>\n");
364 objtext(opage
, "]\n");
367 objtext(opage
, ">>\n");
371 * Set up the outlines dictionary.
377 objtext(outlines
, "<<\n/Type /Outlines\n");
378 topcount
= make_outline(outlines
, doc
->outline_elements
,
379 doc
->n_outline_elements
, TRUE
);
380 sprintf(buf
, "/Count %d\n>>\n", topcount
);
381 objtext(outlines
, buf
);
385 * Assemble the final linear form of every object.
387 for (o
= olist
.head
; o
; o
= o
->next
) {
388 rdstringc rs
= {0, 0, NULL
};
391 sprintf(text
, "%d 0 obj\n", o
->number
);
394 if (!o
->main
.text
&& o
->stream
.text
) {
395 sprintf(text
, "<<\n/Length %d\n>>\n", o
->stream
.pos
);
396 rdaddsc(&o
->main
, text
);
399 assert(o
->main
.text
);
400 rdaddsc(&rs
, o
->main
.text
);
403 if (rs
.text
[rs
.pos
-1] != '\n')
406 if (o
->stream
.text
) {
408 * FIXME: If we ever start compressing stream data then
409 * it will have zero bytes in it, so we'll have to be
410 * more careful than this.
412 rdaddsc(&rs
, "stream\n");
413 rdaddsc(&rs
, o
->stream
.text
);
414 rdaddsc(&rs
, "\nendstream\n");
415 sfree(o
->stream
.text
);
418 rdaddsc(&rs
, "endobj\n");
425 * Write out the PDF file.
428 fp
= fopen(filename
, "wb");
430 error(err_cantopenw
, filename
);
435 * Header. I'm going to put the version IDs in the header as
436 * well, simply in PDF comments.
438 fileoff
= fprintf(fp
, "%%PDF-1.3\n");
439 for (p
= sourceform
; p
; p
= p
->next
)
440 if (p
->type
== para_VersionID
)
441 fileoff
+= pdf_versionid(fp
, p
->words
);
446 for (o
= olist
.head
; o
; o
= o
->next
) {
447 o
->fileoff
= fileoff
;
448 fwrite(o
->final
, 1, o
->size
, fp
);
453 * Cross-reference table
455 fprintf(fp
, "xref\n");
456 assert(olist
.head
->number
== 1);
457 fprintf(fp
, "0 %d\n", olist
.tail
->number
+ 1);
458 fprintf(fp
, "0000000000 65535 f \n");
459 for (o
= olist
.head
; o
; o
= o
->next
) {
461 sprintf(entry
, "%010d 00000 n \n", o
->fileoff
);
462 assert(strlen(entry
) == 20);
469 fprintf(fp
, "trailer\n<<\n/Size %d\n/Root %d 0 R\n>>\n",
470 olist
.tail
->number
+ 1, cat
->number
);
471 fprintf(fp
, "startxref\n%d\n%%%%EOF\n", fileoff
);
478 static object
*new_object(objlist
*list
)
480 object
*obj
= mknew(object
);
484 obj
->main
.text
= NULL
;
485 obj
->main
.pos
= obj
->main
.size
= 0;
486 obj
->stream
.text
= NULL
;
487 obj
->stream
.pos
= obj
->stream
.size
= 0;
489 obj
->number
= list
->number
++;
493 list
->tail
->next
= obj
;
504 static void objtext(object
*o
, char const *text
)
506 rdaddsc(&o
->main
, text
);
509 static void objstream(object
*o
, char const *text
)
511 rdaddsc(&o
->stream
, text
);
514 static void objref(object
*o
, object
*dest
)
517 sprintf(buf
, "%d 0 R", dest
->number
);
518 rdaddsc(&o
->main
, buf
);
521 static void make_pages_node(object
*node
, object
*parent
, page_data
*first
,
522 page_data
*last
, object
*resources
)
528 objtext(node
, "<<\n/Type /Pages\n");
530 objtext(node
, "/Parent ");
531 objref(node
, parent
);
536 * Count the pages in this stretch, to see if there are few
537 * enough to reference directly.
540 for (page
= first
; page
; page
= page
->next
) {
546 sprintf(buf
, "/Count %d\n/Kids [\n", count
);
549 if (count
> TREE_BRANCH
) {
551 page_data
*thisfirst
, *thislast
;
555 for (i
= 0; i
< TREE_BRANCH
; i
++) {
556 int number
= (i
+1) * count
/ TREE_BRANCH
- i
* count
/ TREE_BRANCH
;
563 if (thisfirst
== thislast
) {
564 objref(node
, (object
*)thisfirst
->spare
);
565 objtext((object
*)thisfirst
->spare
, "/Parent ");
566 objref((object
*)thisfirst
->spare
, node
);
567 objtext((object
*)thisfirst
->spare
, "\n");
569 object
*newnode
= new_object(node
->list
);
570 make_pages_node(newnode
, node
, thisfirst
, thislast
, NULL
);
571 objref(node
, newnode
);
576 assert(thislast
== last
|| page
== NULL
);
579 for (page
= first
; page
; page
= page
->next
) {
580 objref(node
, (object
*)page
->spare
);
582 objtext((object
*)page
->spare
, "/Parent ");
583 objref((object
*)page
->spare
, node
);
584 objtext((object
*)page
->spare
, "\n");
590 objtext(node
, "]\n");
593 objtext(node
, "/Resources ");
594 objref(node
, resources
);
598 objtext(node
, ">>\n");
602 * In text on the page, PDF uses the PostScript font model, which
603 * means that glyphs are identified by PS strings and hence font
604 * encoding can be managed independently of the supplied encoding
605 * of the font. However, in the document outline, the PDF spec
606 * simply asks for ordinary text strings without mentioning what
607 * character set they are supposed to be interpreted in.
609 * Therefore, for the moment, I'm going to assume they're US-ASCII
610 * only. If anyone knows better, they should let me know :-/
612 static int pdf_convert(wchar_t *s
, char **result
) {
613 int doing
= (result
!= 0);
616 int plen
= 0, psize
= 0;
622 if (c
>= 32 && c
<= 126) {
626 /* Char is not OK. */
628 outc
= 0xBF; /* approximate the good old DEC `uh?' */
633 p
= resize(p
, psize
);
639 p
= resize(p
, plen
+1);
646 static int make_outline(object
*parent
, outline_element
*items
, int n
,
649 int level
, totalcount
= 0;
650 outline_element
*itemp
;
651 object
*curr
, *prev
= NULL
, *first
= NULL
, *last
= NULL
;
655 level
= items
->level
;
661 * Here we expect to be sitting on an item at the given
662 * level. So we start by constructing an outline entry for
665 assert(items
->level
== level
);
667 pdf_convert(items
->pdata
->outline_title
, &title
);
670 curr
= new_object(parent
->list
);
671 if (!first
) first
= curr
;
673 objtext(curr
, "<<\n/Title ");
674 pdf_string(objtext
, curr
, title
);
675 objtext(curr
, "\n/Parent ");
676 objref(curr
, parent
);
677 objtext(curr
, "\n/Dest [");
678 objref(curr
, (object
*)items
->pdata
->first
->page
->spare
);
679 objtext(curr
, " /XYZ null null null]\n");
681 objtext(curr
, "/Prev ");
685 objtext(prev
, "/Next ");
687 objtext(prev
, "\n>>\n");
692 for (itemp
= items
; itemp
< items
+n
&& itemp
->level
> level
;
697 int count
= make_outline(curr
, items
, itemp
- items
, FALSE
);
702 sprintf(buf
, "/Count %d\n", count
);
709 objtext(prev
, ">>\n");
711 assert(first
&& last
);
712 objtext(parent
, "/First ");
713 objref(parent
, first
);
714 objtext(parent
, "\n/Last ");
715 objref(parent
, last
);
716 objtext(parent
, "\n");
721 static int pdf_versionid(FILE *fp
, word
*words
)
725 ret
= fprintf(fp
, "%% ");
727 for (; words
; words
= words
->next
) {
731 switch (words
->type
) {
741 type
= removeattr(words
->type
);
745 text
= utoa_dup(words
->text
);
747 case word_WhiteSpace
:
760 ret
+= fprintf(fp
, "\n");
765 static void pdf_string(void (*add
)(object
*, char const *),
766 object
*o
, char const *str
)
771 for (p
= str
; *p
; p
++) {
773 if (*p
== '\\' || *p
== '(' || *p
== ')')