2 * PDF backend for Halibut
9 #define TREE_BRANCH 2 /* max branching factor in page tree */
11 paragraph
*pdf_config_filename(char *filename
)
13 return cmdline_cfg_simple("pdf-filename", filename
, NULL
);
/*
 * Short names for the two structures this backend is built around.
 * Declared ahead of the structure bodies so that prototypes and the
 * structures themselves can refer to either type by pointer.
 */
typedef struct object_Tag object;
typedef struct objlist_Tag objlist;
23 rdstringc main
, stream
;
33 static object
*new_object(objlist
*list
);
34 static void objtext(object
*o
, char const *text
);
35 static void objstream(object
*o
, char const *text
);
36 static void pdf_string(void (*add
)(object
*, char const *),
37 object
*, char const *);
38 static void pdf_string_len(void (*add
)(object
*, char const *),
39 object
*, char const *, int);
40 static void objref(object
*o
, object
*dest
);
41 static char *pdf_outline_convert(wchar_t *s
, int *len
);
43 static int is_std_font(char const *name
);
45 static void make_pages_node(object
*node
, object
*parent
, page_data
*first
,
46 page_data
*last
, object
*resources
,
48 static int make_outline(object
*parent
, outline_element
*start
, int n
,
50 static int pdf_versionid(FILE *fp
, word
*words
);
52 void pdf_backend(paragraph
*sourceform
, keywordlist
*keywords
,
53 indexdata
*idx
, void *vdoc
) {
54 document
*doc
= (document
*)vdoc
;
63 object
*o
, *info
, *cat
, *outlines
, *pages
, *resources
, *mediabox
;
69 filename
= dupstr("output.pdf");
70 for (p
= sourceform
; p
; p
= p
->next
) {
71 if (p
->type
== para_Config
) {
72 if (!ustricmp(p
->keyword
, L
"pdf-filename")) {
74 filename
= dupstr(adv(p
->origkeyword
));
79 olist
.head
= olist
.tail
= NULL
;
85 info
= new_object(&olist
);
86 objtext(info
, "<<\n");
87 if (doc
->n_outline_elements
> 0) {
92 pdf_outline_convert(doc
->outline_elements
->pdata
->outline_title
,
94 objtext(info
, "/Title ");
95 pdf_string_len(objtext
, info
, title
, titlelen
);
99 objtext(info
, "/Producer ");
100 sprintf(buf
, "Halibut, %s", version
);
101 pdf_string(objtext
, info
, buf
);
102 objtext(info
, "\n>>\n");
105 cat
= new_object(&olist
);
106 if (doc
->n_outline_elements
> 0)
107 outlines
= new_object(&olist
);
110 pages
= new_object(&olist
);
111 resources
= new_object(&olist
);
114 * The catalogue just contains references to the outlines and
117 objtext(cat
, "<<\n/Type /Catalog");
119 objtext(cat
, "\n/Outlines ");
120 objref(cat
, outlines
);
122 objtext(cat
, "\n/Pages ");
125 objtext(cat
, "\n/PageMode /UseOutlines");
126 objtext(cat
, "\n>>\n");
129 * Set up the resources dictionary, which mostly means
130 * providing all the font objects and names to call them by.
133 objtext(resources
, "<<\n/Font <<\n");
134 for (fe
= doc
->fonts
->head
; fe
; fe
= fe
->next
) {
139 sprintf(fname
, "f%d", font_index
++);
140 fe
->name
= dupstr(fname
);
142 font
= new_object(&olist
);
144 objtext(resources
, "/");
145 objtext(resources
, fe
->name
);
146 objtext(resources
, " ");
147 objref(resources
, font
);
148 objtext(resources
, "\n");
150 objtext(font
, "<<\n/Type /Font\n/Subtype /Type1\n/Name /");
151 objtext(font
, fe
->name
);
152 objtext(font
, "\n/BaseFont /");
153 objtext(font
, fe
->font
->info
->name
);
154 objtext(font
, "\n/Encoding <<\n/Type /Encoding\n/Differences [");
156 for (i
= 0; i
< 256; i
++) {
160 sprintf(buf
, "\n%d /", i
);
162 objtext(font
, fe
->vector
[i
] ? fe
->vector
[i
] : ".notdef");
165 objtext(font
, "\n]\n>>\n");
167 if (!is_std_font(fe
->font
->info
->name
)){
168 object
*widths
= new_object(&olist
);
169 int firstchar
= -1, lastchar
= -1;
171 for (i
= 0; i
< 256; i
++)
172 if (fe
->indices
[i
] >= 0) {
173 if (firstchar
< 0) firstchar
= i
;
176 sprintf(buf
, "/FirstChar %d\n/LastChar %d\n/Widths ",
177 firstchar
, lastchar
);
179 objref(font
, widths
);
181 objtext(widths
, "[\n");
182 for (i
= firstchar
; i
<= lastchar
; i
++) {
185 if (fe
->indices
[i
] < 0)
188 width
= fe
->font
->info
->widths
[fe
->indices
[i
]];
189 sprintf(buf
, "%g\n", 1000.0 * width
/ FUNITS_PER_PT
);
190 objtext(widths
, buf
);
192 objtext(widths
, "]\n");
195 objtext(font
, ">>\n");
197 objtext(resources
, ">>\n>>\n");
201 mediabox
= new_object(&olist
);
202 sprintf(buf
, "[0 0 %g %g]\n",
203 doc
->paper_width
/ FUNITS_PER_PT
,
204 doc
->paper_height
/ FUNITS_PER_PT
);
205 objtext(mediabox
, buf
);
209 * Define the page objects for each page, and get each one
210 * ready to have a `Parent' specification added to it.
212 for (page
= doc
->pages
; page
; page
= page
->next
) {
215 opage
= new_object(&olist
);
217 objtext(opage
, "<<\n/Type /Page\n");
221 * Recursively build the page tree.
223 make_pages_node(pages
, NULL
, doc
->pages
, NULL
, resources
, mediabox
);
226 * Create and render the individual pages.
229 for (page
= doc
->pages
; page
; page
= page
->next
) {
230 object
*opage
, *cstr
;
232 text_fragment
*frag
, *frag_end
;
236 opage
= (object
*)page
->spare
;
238 * At this point the page dictionary is already
239 * half-written, with /Type and /Parent already present. We
240 * continue from there.
244 * The PDF spec says /Resources is required, but also says
245 * that it's inheritable and may be omitted if it's present
246 * in a Pages node. In our case it is: it's present in the
247 * topmost /Pages node because we carefully put it there.
248 * So we don't need a /Resources entry here. The same applies
253 * Now we're ready to define a content stream containing
254 * the actual text on the page.
256 cstr
= new_object(&olist
);
257 objtext(opage
, "/Contents ");
259 objtext(opage
, "\n");
262 * Render any rectangles on the page.
264 for (r
= page
->first_rect
; r
; r
= r
->next
) {
266 sprintf(buf
, "%g %g %g %g re f\n",
267 r
->x
/ FUNITS_PER_PT
, r
->y
/ FUNITS_PER_PT
,
268 r
->w
/ FUNITS_PER_PT
, r
->h
/ FUNITS_PER_PT
);
269 objstream(cstr
, buf
);
272 objstream(cstr
, "BT\n");
275 * PDF tracks two separate current positions: the position
276 * given in the `line matrix' and the position given in the
277 * `text matrix'. We must therefore track both as well.
278 * They start off at -1 (unset).
283 frag
= page
->first_text
;
286 * For compactness, I'm going to group text fragments
287 * into subsequences that use the same font+size. So
288 * first find the end of this subsequence.
290 for (frag_end
= frag
;
292 frag_end
->fe
== frag
->fe
&&
293 frag_end
->fontsize
== frag
->fontsize
);
294 frag_end
= frag_end
->next
);
297 * Now select the text fragment, and prepare to display
300 objstream(cstr
, "/");
301 objstream(cstr
, frag
->fe
->name
);
302 sprintf(buf
, " %d Tf ", frag
->fontsize
);
303 objstream(cstr
, buf
);
305 while (frag
&& frag
!= frag_end
) {
307 * Place the text position for the first piece of
311 sprintf(buf
, "1 0 0 1 %g %g Tm ",
312 frag
->x
/FUNITS_PER_PT
, frag
->y
/FUNITS_PER_PT
);
314 sprintf(buf
, "%g %g Td ",
315 (frag
->x
- lx
)/FUNITS_PER_PT
,
316 (frag
->y
- ly
)/FUNITS_PER_PT
);
318 objstream(cstr
, buf
);
323 * See if we're going to use Tj (show a single
324 * string) or TJ (show an array of strings with
325 * x-spacings between them). We determine this by
326 * seeing if there's more than one text fragment in
327 * sequence with the same y-coordinate.
329 if (frag
->next
&& frag
->next
!= frag_end
&&
330 frag
->next
->y
== y
) {
334 objstream(cstr
, "[");
335 while (frag
&& frag
!= frag_end
&& frag
->y
== y
) {
338 (x
- frag
->x
) * 1000.0 /
339 (FUNITS_PER_PT
* frag
->fontsize
));
340 objstream(cstr
, buf
);
342 pdf_string(objstream
, cstr
, frag
->text
);
343 x
= frag
->x
+ frag
->width
;
346 objstream(cstr
, "]TJ\n");
352 pdf_string(objstream
, cstr
, frag
->text
);
353 objstream(cstr
, "Tj\n");
358 objstream(cstr
, "ET");
361 * Also, we want an annotation dictionary containing the
362 * cross-references from this page.
364 if (page
->first_xref
) {
366 objtext(opage
, "/Annots [\n");
368 for (xr
= page
->first_xref
; xr
; xr
= xr
->next
) {
372 annot
= new_object(&olist
);
373 objref(opage
, annot
);
374 objtext(opage
, "\n");
376 objtext(annot
, "<<\n/Type /Annot\n/Subtype /Link\n/Rect [");
377 sprintf(buf
, "%g %g %g %g",
378 xr
->lx
/ FUNITS_PER_PT
, xr
->by
/ FUNITS_PER_PT
,
379 xr
->rx
/ FUNITS_PER_PT
, xr
->ty
/ FUNITS_PER_PT
);
381 objtext(annot
, "]\n/Border [0 0 0]\n");
383 if (xr
->dest
.type
== PAGE
) {
384 objtext(annot
, "/Dest [");
385 objref(annot
, (object
*)xr
->dest
.page
->spare
);
386 objtext(annot
, " /XYZ null null null]\n");
388 objtext(annot
, "/A <<\n/Type /Action\n/S /URI\n/URI ");
389 pdf_string(objtext
, annot
, xr
->dest
.url
);
390 objtext(annot
, "\n>>\n");
393 objtext(annot
, ">>\n");
396 objtext(opage
, "]\n");
399 objtext(opage
, ">>\n");
403 * Set up the outlines dictionary.
409 objtext(outlines
, "<<\n/Type /Outlines\n");
410 topcount
= make_outline(outlines
, doc
->outline_elements
,
411 doc
->n_outline_elements
, TRUE
);
412 sprintf(buf
, "/Count %d\n>>\n", topcount
);
413 objtext(outlines
, buf
);
417 * Assemble the final linear form of every object.
419 for (o
= olist
.head
; o
; o
= o
->next
) {
420 rdstringc rs
= {0, 0, NULL
};
423 sprintf(text
, "%d 0 obj\n", o
->number
);
426 if (!o
->main
.text
&& o
->stream
.text
) {
427 sprintf(text
, "<<\n/Length %d\n>>\n", o
->stream
.pos
);
428 rdaddsc(&o
->main
, text
);
431 assert(o
->main
.text
);
432 rdaddsc(&rs
, o
->main
.text
);
435 if (rs
.text
[rs
.pos
-1] != '\n')
438 if (o
->stream
.text
) {
440 * FIXME: If we ever start compressing stream data then
441 * it will have zero bytes in it, so we'll have to be
442 * more careful than this.
444 rdaddsc(&rs
, "stream\n");
445 rdaddsc(&rs
, o
->stream
.text
);
446 rdaddsc(&rs
, "\nendstream\n");
447 sfree(o
->stream
.text
);
450 rdaddsc(&rs
, "endobj\n");
457 * Write out the PDF file.
460 fp
= fopen(filename
, "wb");
462 error(err_cantopenw
, filename
);
467 * Header. I'm going to put the version IDs in the header as
468 * well, simply in PDF comments.
470 fileoff
= fprintf(fp
, "%%PDF-1.3\n");
471 for (p
= sourceform
; p
; p
= p
->next
)
472 if (p
->type
== para_VersionID
)
473 fileoff
+= pdf_versionid(fp
, p
->words
);
478 for (o
= olist
.head
; o
; o
= o
->next
) {
479 o
->fileoff
= fileoff
;
480 fwrite(o
->final
, 1, o
->size
, fp
);
485 * Cross-reference table
487 fprintf(fp
, "xref\n");
488 assert(olist
.head
->number
== 1);
489 fprintf(fp
, "0 %d\n", olist
.tail
->number
+ 1);
490 fprintf(fp
, "0000000000 65535 f \n");
491 for (o
= olist
.head
; o
; o
= o
->next
) {
493 sprintf(entry
, "%010d 00000 n \n", o
->fileoff
);
494 assert(strlen(entry
) == 20);
501 fprintf(fp
, "trailer\n<<\n/Size %d\n/Root %d 0 R\n/Info %d 0 R\n>>\n",
502 olist
.tail
->number
+ 1, cat
->number
, info
->number
);
503 fprintf(fp
, "startxref\n%d\n%%%%EOF\n", fileoff
);
510 static object
*new_object(objlist
*list
)
512 object
*obj
= snew(object
);
516 obj
->main
.text
= NULL
;
517 obj
->main
.pos
= obj
->main
.size
= 0;
518 obj
->stream
.text
= NULL
;
519 obj
->stream
.pos
= obj
->stream
.size
= 0;
521 obj
->number
= list
->number
++;
525 list
->tail
->next
= obj
;
536 static void objtext(object
*o
, char const *text
)
538 rdaddsc(&o
->main
, text
);
541 static void objstream(object
*o
, char const *text
)
543 rdaddsc(&o
->stream
, text
);
546 static void objref(object
*o
, object
*dest
)
549 sprintf(buf
, "%d 0 R", dest
->number
);
550 rdaddsc(&o
->main
, buf
);
/*
 * Font names treated as standard.  pdf_backend() emits a separate
 * /Widths object (plus /FirstChar and /LastChar) only for fonts whose
 * name is NOT in this table.
 */
static char const * const stdfonts[] = {
    "Times-Roman", "Times-Bold", "Times-Italic", "Times-BoldItalic",
    "Helvetica", "Helvetica-Bold", "Helvetica-Oblique","Helvetica-BoldOblique",
    "Courier", "Courier-Bold", "Courier-Oblique", "Courier-BoldOblique",
    "Symbol", "ZapfDingbats"
};

/*
 * Report whether `name' exactly matches (case-sensitively) one of the
 * entries in stdfonts[].
 *
 * name:    NUL-terminated font name; must not be NULL.
 * Returns: non-zero if the name is in the table, zero otherwise.
 */
static int is_std_font(char const *name) {
    unsigned i;

    for (i = 0; i < sizeof(stdfonts) / sizeof(stdfonts[0]); i++)
        if (strcmp(name, stdfonts[i]) == 0)
            return 1;
    return 0;
}
568 static void make_pages_node(object
*node
, object
*parent
, page_data
*first
,
569 page_data
*last
, object
*resources
,
576 objtext(node
, "<<\n/Type /Pages\n");
578 objtext(node
, "/Parent ");
579 objref(node
, parent
);
584 * Count the pages in this stretch, to see if there are few
585 * enough to reference directly.
588 for (page
= first
; page
; page
= page
->next
) {
594 sprintf(buf
, "/Count %d\n/Kids [\n", count
);
597 if (count
> TREE_BRANCH
) {
599 page_data
*thisfirst
, *thislast
;
603 for (i
= 0; i
< TREE_BRANCH
; i
++) {
604 int number
= (i
+1) * count
/ TREE_BRANCH
- i
* count
/ TREE_BRANCH
;
611 if (thisfirst
== thislast
) {
612 objref(node
, (object
*)thisfirst
->spare
);
613 objtext((object
*)thisfirst
->spare
, "/Parent ");
614 objref((object
*)thisfirst
->spare
, node
);
615 objtext((object
*)thisfirst
->spare
, "\n");
617 object
*newnode
= new_object(node
->list
);
618 make_pages_node(newnode
, node
, thisfirst
, thislast
,
620 objref(node
, newnode
);
625 assert(thislast
== last
|| page
== NULL
);
628 for (page
= first
; page
; page
= page
->next
) {
629 objref(node
, (object
*)page
->spare
);
631 objtext((object
*)page
->spare
, "/Parent ");
632 objref((object
*)page
->spare
, node
);
633 objtext((object
*)page
->spare
, "\n");
639 objtext(node
, "]\n");
642 objtext(node
, "/Resources ");
643 objref(node
, resources
);
647 objtext(node
, "/MediaBox ");
648 objref(node
, mediabox
);
652 objtext(node
, ">>\n");
656 * In text on the page, PDF uses the PostScript font model, which
657 * means that glyphs are identified by PS strings and hence font
658 * encoding can be managed independently of the supplied encoding
659 * of the font. However, in the document outline, the PDF spec
660 * encodes in either PDFDocEncoding (a custom superset of
661 * ISO-8859-1) or UTF-16BE.
663 static char *pdf_outline_convert(wchar_t *s
, int *len
) {
666 ret
= utoa_careful_dup(s
, CS_PDF
);
669 * Very silly special case: if the returned string begins with
670 * FE FF, then the PDF reader will mistake it for a UTF-16BE
671 * string. So in this case we give up on PDFDocEncoding and
672 * encode it in UTF-16 straight away.
674 if (ret
&& ret
[0] == '\xFE' && ret
[1] == '\xFF') {
680 ret
= utoa_dup_len(s
, CS_UTF16BE
, len
);
688 static int make_outline(object
*parent
, outline_element
*items
, int n
,
691 int level
, totalcount
= 0;
692 outline_element
*itemp
;
693 object
*curr
, *prev
= NULL
, *first
= NULL
, *last
= NULL
;
697 level
= items
->level
;
704 * Here we expect to be sitting on an item at the given
705 * level. So we start by constructing an outline entry for
708 assert(items
->level
== level
);
710 title
= pdf_outline_convert(items
->pdata
->outline_title
, &titlelen
);
713 curr
= new_object(parent
->list
);
714 if (!first
) first
= curr
;
716 objtext(curr
, "<<\n/Title ");
717 pdf_string_len(objtext
, curr
, title
, titlelen
);
719 objtext(curr
, "\n/Parent ");
720 objref(curr
, parent
);
721 objtext(curr
, "\n/Dest [");
722 objref(curr
, (object
*)items
->pdata
->first
->page
->spare
);
723 objtext(curr
, " /XYZ null null null]\n");
725 objtext(curr
, "/Prev ");
729 objtext(prev
, "/Next ");
731 objtext(prev
, "\n>>\n");
736 for (itemp
= items
; itemp
< items
+n
&& itemp
->level
> level
;
741 int count
= make_outline(curr
, items
, itemp
- items
, FALSE
);
746 sprintf(buf
, "/Count %d\n", count
);
753 objtext(prev
, ">>\n");
755 assert(first
&& last
);
756 objtext(parent
, "/First ");
757 objref(parent
, first
);
758 objtext(parent
, "\n/Last ");
759 objref(parent
, last
);
760 objtext(parent
, "\n");
765 static int pdf_versionid(FILE *fp
, word
*words
)
769 ret
= fprintf(fp
, "%% ");
771 for (; words
; words
= words
->next
) {
775 switch (words
->type
) {
785 type
= removeattr(words
->type
);
789 text
= utoa_dup(words
->text
, CS_ASCII
);
791 case word_WhiteSpace
:
804 ret
+= fprintf(fp
, "\n");
809 static void pdf_string_len(void (*add
)(object
*, char const *),
810 object
*o
, char const *str
, int len
)
815 for (p
= str
; len
> 0; p
++, len
--) {
817 if (*p
< ' ' || *p
> '~') {
818 sprintf(c
, "\\%03o", 0xFF & (int)*p
);
821 if (*p
== '\\' || *p
== '(' || *p
== ')')
831 static void pdf_string(void (*add
)(object
*, char const *),
832 object
*o
, char const *str
)
834 pdf_string_len(add
, o
, str
, strlen(str
));