2 * PDF backend for Halibut
10 #define TREE_BRANCH 8 /* max branching factor in page tree */
12 paragraph
*pdf_config_filename(char *filename
)
14 return cmdline_cfg_simple("pdf-filename", filename
, NULL
);
17 typedef struct object_Tag object
;
18 typedef struct objlist_Tag objlist
;
24 rdstringc main
, stream
;
34 static object
*new_object(objlist
*list
);
35 static void objtext(object
*o
, char const *text
);
36 static void objstream(object
*o
, char const *text
);
37 static void pdf_string(void (*add
)(object
*, char const *),
38 object
*, char const *);
39 static void pdf_string_len(void (*add
)(object
*, char const *),
40 object
*, char const *, int);
41 static void objref(object
*o
, object
*dest
);
42 static void objdest(object
*o
, page_data
*p
);
43 static char *pdf_outline_convert(wchar_t *s
, int *len
);
45 static int is_std_font(char const *name
);
47 static void make_pages_node(object
*node
, object
*parent
, page_data
*first
,
48 page_data
*last
, object
*resources
,
50 static int make_outline(object
*parent
, outline_element
*start
, int n
,
52 static int pdf_versionid(FILE *fp
, word
*words
);
54 void pdf_backend(paragraph
*sourceform
, keywordlist
*keywords
,
55 indexdata
*idx
, void *vdoc
) {
56 document
*doc
= (document
*)vdoc
;
65 object
*o
, *info
, *cat
, *outlines
, *pages
, *resources
, *mediabox
;
71 filename
= dupstr("output.pdf");
72 for (p
= sourceform
; p
; p
= p
->next
) {
73 if (p
->type
== para_Config
) {
74 if (!ustricmp(p
->keyword
, L
"pdf-filename")) {
76 filename
= dupstr(adv(p
->origkeyword
));
81 olist
.head
= olist
.tail
= NULL
;
87 info
= new_object(&olist
);
88 objtext(info
, "<<\n");
89 if (doc
->n_outline_elements
> 0) {
94 pdf_outline_convert(doc
->outline_elements
->pdata
->outline_title
,
96 objtext(info
, "/Title ");
97 pdf_string_len(objtext
, info
, title
, titlelen
);
101 objtext(info
, "/Producer ");
102 sprintf(buf
, "Halibut, %s", version
);
103 pdf_string(objtext
, info
, buf
);
104 objtext(info
, "\n>>\n");
107 cat
= new_object(&olist
);
108 if (doc
->n_outline_elements
> 0)
109 outlines
= new_object(&olist
);
112 pages
= new_object(&olist
);
113 resources
= new_object(&olist
);
116 * The catalogue just contains references to the outlines and
117 * pages objects, and the pagelabels dictionary.
119 objtext(cat
, "<<\n/Type /Catalog");
121 objtext(cat
, "\n/Outlines ");
122 objref(cat
, outlines
);
124 objtext(cat
, "\n/Pages ");
126 /* Halibut just numbers pages 1, 2, 3, ... */
127 objtext(cat
, "\n/PageLabels<</Nums[0<</S/D>>]>>");
129 objtext(cat
, "\n/PageMode /UseOutlines");
130 objtext(cat
, "\n>>\n");
133 * Set up the resources dictionary, which mostly means
134 * providing all the font objects and names to call them by.
137 objtext(resources
, "<<\n/ProcSet [/PDF/Text]\n/Font <<\n");
138 for (fe
= doc
->fonts
->head
; fe
; fe
= fe
->next
) {
143 sprintf(fname
, "f%d", font_index
++);
144 fe
->name
= dupstr(fname
);
146 font
= new_object(&olist
);
148 objtext(resources
, "/");
149 objtext(resources
, fe
->name
);
150 objtext(resources
, " ");
151 objref(resources
, font
);
152 objtext(resources
, "\n");
154 objtext(font
, "<<\n/Type /Font\n/Subtype /Type1\n/Name /");
155 objtext(font
, fe
->name
);
156 objtext(font
, "\n/BaseFont /");
157 objtext(font
, fe
->font
->info
->name
);
158 objtext(font
, "\n/Encoding <<\n/Type /Encoding\n/Differences [");
160 for (i
= 0; i
< 256; i
++) {
165 sprintf(buf
, "\n%d", i
);
168 objtext(font
, i
% 8 ?
"/" : "\n/");
169 objtext(font
, fe
->vector
[i
]);
173 objtext(font
, "\n]\n>>\n");
175 #define FF_FIXEDPITCH 0x00000001
176 #define FF_SERIF 0x00000002
177 #define FF_SYMBOLIC 0x00000004
178 #define FF_SCRIPT 0x00000008
179 #define FF_NONSYMBOLIC 0x00000020
180 #define FF_ITALIC 0x00000040
181 #define FF_ALLCAP 0x00010000
182 #define FF_SMALLCAP 0x00020000
183 #define FF_FORCEBOLD 0x00040000
185 if (!is_std_font(fe
->font
->info
->name
)){
186 object
*widths
= new_object(&olist
);
187 object
*fontdesc
= new_object(&olist
);
188 int firstchar
= -1, lastchar
= -1;
190 font_info
const *fi
= fe
->font
->info
;
192 for (i
= 0; i
< 256; i
++)
193 if (fe
->indices
[i
] >= 0) {
194 if (firstchar
< 0) firstchar
= i
;
197 sprintf(buf
, "/FirstChar %d\n/LastChar %d\n/Widths ",
198 firstchar
, lastchar
);
200 objref(font
, widths
);
202 objtext(widths
, "[\n");
203 for (i
= firstchar
; i
<= lastchar
; i
++) {
205 if (fe
->indices
[i
] < 0)
208 width
= fi
->widths
[fe
->indices
[i
]];
209 sprintf(buf
, "%g\n", 1000.0 * width
/ FUNITS_PER_PT
);
210 objtext(widths
, buf
);
212 objtext(widths
, "]\n");
213 objtext(font
, "/FontDescriptor ");
214 objref(font
, fontdesc
);
215 objtext(fontdesc
, "<<\n/Type /FontDescriptor\n/Name /");
216 objtext(fontdesc
, fi
->name
);
218 if (fi
->italicangle
) flags
|= FF_ITALIC
;
219 flags
|= FF_NONSYMBOLIC
;
220 sprintf(buf
, "\n/Flags %d\n", flags
);
221 objtext(fontdesc
, buf
);
222 sprintf(buf
, "/FontBBox [%g %g %g %g]\n", fi
->fontbbox
[0],
223 fi
->fontbbox
[1], fi
->fontbbox
[2], fi
->fontbbox
[3]);
224 objtext(fontdesc
, buf
);
225 sprintf(buf
, "/ItalicAngle %g\n", fi
->italicangle
);
226 objtext(fontdesc
, buf
);
227 sprintf(buf
, "/Ascent %g\n", fi
->ascent
);
228 objtext(fontdesc
, buf
);
229 sprintf(buf
, "/Descent %g\n", fi
->descent
);
230 objtext(fontdesc
, buf
);
231 sprintf(buf
, "/CapHeight %g\n", fi
->capheight
);
232 objtext(fontdesc
, buf
);
233 sprintf(buf
, "/XHeight %g\n", fi
->xheight
);
234 objtext(fontdesc
, buf
);
235 sprintf(buf
, "/StemH %g\n", fi
->stemh
);
236 objtext(fontdesc
, buf
);
237 sprintf(buf
, "/StemV %g\n", fi
->stemv
);
238 objtext(fontdesc
, buf
);
240 object
*fontfile
= new_object(&olist
);
245 len
= fread(buf
, 1, sizeof(buf
)-1, fi
->fp
);
247 objstream(fontfile
, buf
);
248 } while (len
== sizeof(buf
)-1);
249 objtext(fontdesc
, "/FontFile ");
250 objref(fontdesc
, fontfile
);
252 objtext(fontdesc
, "\n>>\n");
255 objtext(font
, "\n>>\n");
257 objtext(resources
, ">>\n>>\n");
261 mediabox
= new_object(&olist
);
262 sprintf(buf
, "[0 0 %g %g]\n",
263 doc
->paper_width
/ FUNITS_PER_PT
,
264 doc
->paper_height
/ FUNITS_PER_PT
);
265 objtext(mediabox
, buf
);
269 * Define the page objects for each page, and get each one
270 * ready to have a `Parent' specification added to it.
272 for (page
= doc
->pages
; page
; page
= page
->next
) {
275 opage
= new_object(&olist
);
277 objtext(opage
, "<<\n/Type /Page\n");
281 * Recursively build the page tree.
283 make_pages_node(pages
, NULL
, doc
->pages
, NULL
, resources
, mediabox
);
286 * Create and render the individual pages.
289 for (page
= doc
->pages
; page
; page
= page
->next
) {
290 object
*opage
, *cstr
;
292 text_fragment
*frag
, *frag_end
;
296 opage
= (object
*)page
->spare
;
298 * At this point the page dictionary is already
299 * half-written, with /Type and /Parent already present. We
300 * continue from there.
304 * The PDF spec says /Resources is required, but also says
305 * that it's inheritable and may be omitted if it's present
306 * in a Pages node. In our case it is: it's present in the
307 * topmost /Pages node because we carefully put it there.
308 * So we don't need a /Resources entry here. The same applies
313 * Now we're ready to define a content stream containing
314 * the actual text on the page.
316 cstr
= new_object(&olist
);
317 objtext(opage
, "/Contents ");
319 objtext(opage
, "\n");
322 * Render any rectangles on the page.
324 for (r
= page
->first_rect
; r
; r
= r
->next
) {
326 sprintf(buf
, "%g %g %g %g re f\n",
327 r
->x
/ FUNITS_PER_PT
, r
->y
/ FUNITS_PER_PT
,
328 r
->w
/ FUNITS_PER_PT
, r
->h
/ FUNITS_PER_PT
);
329 objstream(cstr
, buf
);
332 objstream(cstr
, "BT\n");
335 * PDF tracks two separate current positions: the position
336 * given in the `line matrix' and the position given in the
337 * `text matrix'. We must therefore track both as well.
338 * They start off at -1 (unset).
343 frag
= page
->first_text
;
346 * For compactness, I'm going to group text fragments
347 * into subsequences that use the same font+size. So
348 * first find the end of this subsequence.
350 for (frag_end
= frag
;
352 frag_end
->fe
== frag
->fe
&&
353 frag_end
->fontsize
== frag
->fontsize
);
354 frag_end
= frag_end
->next
);
357 * Now select the text fragment, and prepare to display
360 objstream(cstr
, "/");
361 objstream(cstr
, frag
->fe
->name
);
362 sprintf(buf
, " %d Tf ", frag
->fontsize
);
363 objstream(cstr
, buf
);
365 while (frag
&& frag
!= frag_end
) {
367 * Place the text position for the first piece of
371 sprintf(buf
, "1 0 0 1 %g %g Tm ",
372 frag
->x
/FUNITS_PER_PT
, frag
->y
/FUNITS_PER_PT
);
374 sprintf(buf
, "%g %g Td ",
375 (frag
->x
- lx
)/FUNITS_PER_PT
,
376 (frag
->y
- ly
)/FUNITS_PER_PT
);
378 objstream(cstr
, buf
);
383 * See if we're going to use Tj (show a single
384 * string) or TJ (show an array of strings with
385 * x-spacings between them). We determine this by
386 * seeing if there's more than one text fragment in
387 * sequence with the same y-coordinate.
389 if (frag
->next
&& frag
->next
!= frag_end
&&
390 frag
->next
->y
== y
) {
394 objstream(cstr
, "[");
395 while (frag
&& frag
!= frag_end
&& frag
->y
== y
) {
398 (x
- frag
->x
) * 1000.0 /
399 (FUNITS_PER_PT
* frag
->fontsize
));
400 objstream(cstr
, buf
);
402 pdf_string(objstream
, cstr
, frag
->text
);
403 x
= frag
->x
+ frag
->width
;
406 objstream(cstr
, "]TJ\n");
412 pdf_string(objstream
, cstr
, frag
->text
);
413 objstream(cstr
, "Tj\n");
418 objstream(cstr
, "ET");
421 * Also, we want an annotation dictionary containing the
422 * cross-references from this page.
424 if (page
->first_xref
) {
426 objtext(opage
, "/Annots [\n");
428 for (xr
= page
->first_xref
; xr
; xr
= xr
->next
) {
431 objtext(opage
, "<</Subtype/Link\n/Rect[");
432 sprintf(buf
, "%g %g %g %g",
433 xr
->lx
/ FUNITS_PER_PT
, xr
->by
/ FUNITS_PER_PT
,
434 xr
->rx
/ FUNITS_PER_PT
, xr
->ty
/ FUNITS_PER_PT
);
436 objtext(opage
, "]/Border[0 0 0]\n");
438 if (xr
->dest
.type
== PAGE
) {
439 objtext(opage
, "/Dest");
440 objdest(opage
, xr
->dest
.page
);
442 objtext(opage
, "/A<</S/URI/URI");
443 pdf_string(objtext
, opage
, xr
->dest
.url
);
444 objtext(opage
, ">>");
447 objtext(opage
, ">>\n");
450 objtext(opage
, "]\n");
453 objtext(opage
, ">>\n");
457 * Set up the outlines dictionary.
463 objtext(outlines
, "<<\n/Type /Outlines\n");
464 topcount
= make_outline(outlines
, doc
->outline_elements
,
465 doc
->n_outline_elements
, TRUE
);
466 sprintf(buf
, "/Count %d\n>>\n", topcount
);
467 objtext(outlines
, buf
);
471 * Assemble the final linear form of every object.
473 for (o
= olist
.head
; o
; o
= o
->next
) {
474 rdstringc rs
= {0, 0, NULL
};
476 deflate_compress_ctx
*zcontext
;
480 sprintf(text
, "%d 0 obj\n", o
->number
);
483 if (!o
->main
.text
&& o
->stream
.text
) {
484 zcontext
= deflate_compress_new(DEFLATE_TYPE_ZLIB
);
485 deflate_compress_data(zcontext
, o
->stream
.text
, o
->stream
.pos
,
486 DEFLATE_END_OF_DATA
, &zbuf
, &zlen
);
487 deflate_compress_free(zcontext
);
488 sprintf(text
, "<<\n/Filter/FlateDecode\n/Length %d\n>>\n", zlen
);
489 rdaddsc(&o
->main
, text
);
492 assert(o
->main
.text
);
493 rdaddsc(&rs
, o
->main
.text
);
496 if (rs
.text
[rs
.pos
-1] != '\n')
499 if (o
->stream
.text
) {
500 rdaddsc(&rs
, "stream\n");
501 rdaddsn(&rs
, zbuf
, zlen
);
502 rdaddsc(&rs
, "\nendstream\n");
503 sfree(o
->stream
.text
);
507 rdaddsc(&rs
, "endobj\n");
514 * Write out the PDF file.
517 fp
= fopen(filename
, "wb");
519 error(err_cantopenw
, filename
);
524 * Header. I'm going to put the version IDs in the header as
525 * well, simply in PDF comments. The PDF Reference also suggests
526 * that binary PDF files contain four top-bit-set characters in
529 fileoff
= fprintf(fp
, "%%PDF-1.3\n%% L\xc3\xba\xc3\xb0""a\n");
530 for (p
= sourceform
; p
; p
= p
->next
)
531 if (p
->type
== para_VersionID
)
532 fileoff
+= pdf_versionid(fp
, p
->words
);
537 for (o
= olist
.head
; o
; o
= o
->next
) {
538 o
->fileoff
= fileoff
;
539 fwrite(o
->final
, 1, o
->size
, fp
);
544 * Cross-reference table
546 fprintf(fp
, "xref\n");
547 assert(olist
.head
->number
== 1);
548 fprintf(fp
, "0 %d\n", olist
.tail
->number
+ 1);
549 fprintf(fp
, "0000000000 65535 f \n");
550 for (o
= olist
.head
; o
; o
= o
->next
) {
552 sprintf(entry
, "%010d 00000 n \n", o
->fileoff
);
553 assert(strlen(entry
) == 20);
560 fprintf(fp
, "trailer\n<<\n/Size %d\n/Root %d 0 R\n/Info %d 0 R\n>>\n",
561 olist
.tail
->number
+ 1, cat
->number
, info
->number
);
562 fprintf(fp
, "startxref\n%d\n%%%%EOF\n", fileoff
);
569 static object
*new_object(objlist
*list
)
571 object
*obj
= snew(object
);
575 obj
->main
.text
= NULL
;
576 obj
->main
.pos
= obj
->main
.size
= 0;
577 obj
->stream
.text
= NULL
;
578 obj
->stream
.pos
= obj
->stream
.size
= 0;
580 obj
->number
= list
->number
++;
584 list
->tail
->next
= obj
;
595 static void objtext(object
*o
, char const *text
)
597 rdaddsc(&o
->main
, text
);
600 static void objstream(object
*o
, char const *text
)
602 rdaddsc(&o
->stream
, text
);
605 static void objref(object
*o
, object
*dest
)
608 sprintf(buf
, "%d 0 R", dest
->number
);
609 rdaddsc(&o
->main
, buf
);
612 static void objdest(object
*o
, page_data
*p
) {
614 objref(o
, (object
*)p
->spare
);
615 objtext(o
, "/XYZ null null null]");
618 static char const * const stdfonts
[] = {
619 "Times-Roman", "Times-Bold", "Times-Italic", "Times-BoldItalic",
620 "Helvetica", "Helvetica-Bold", "Helvetica-Oblique","Helvetica-BoldOblique",
621 "Courier", "Courier-Bold", "Courier-Oblique", "Courier-BoldOblique",
622 "Symbol", "ZapfDingbats"
625 static int is_std_font(char const *name
) {
627 for (i
= 0; i
< lenof(stdfonts
); i
++)
628 if (strcmp(name
, stdfonts
[i
]) == 0)
633 static void make_pages_node(object
*node
, object
*parent
, page_data
*first
,
634 page_data
*last
, object
*resources
,
641 objtext(node
, "<<\n/Type /Pages\n");
643 objtext(node
, "/Parent ");
644 objref(node
, parent
);
649 * Count the pages in this stretch, to see if there are few
650 * enough to reference directly.
653 for (page
= first
; page
; page
= page
->next
) {
659 sprintf(buf
, "/Count %d\n/Kids [\n", count
);
662 if (count
> TREE_BRANCH
) {
664 page_data
*thisfirst
, *thislast
;
668 for (i
= 0; i
< TREE_BRANCH
; i
++) {
669 int number
= (i
+1) * count
/ TREE_BRANCH
- i
* count
/ TREE_BRANCH
;
676 if (thisfirst
== thislast
) {
677 objref(node
, (object
*)thisfirst
->spare
);
678 objtext((object
*)thisfirst
->spare
, "/Parent ");
679 objref((object
*)thisfirst
->spare
, node
);
680 objtext((object
*)thisfirst
->spare
, "\n");
682 object
*newnode
= new_object(node
->list
);
683 make_pages_node(newnode
, node
, thisfirst
, thislast
,
685 objref(node
, newnode
);
690 assert(thislast
== last
|| page
== NULL
);
693 for (page
= first
; page
; page
= page
->next
) {
694 objref(node
, (object
*)page
->spare
);
696 objtext((object
*)page
->spare
, "/Parent ");
697 objref((object
*)page
->spare
, node
);
698 objtext((object
*)page
->spare
, "\n");
704 objtext(node
, "]\n");
707 objtext(node
, "/Resources ");
708 objref(node
, resources
);
712 objtext(node
, "/MediaBox ");
713 objref(node
, mediabox
);
717 objtext(node
, ">>\n");
721 * In text on the page, PDF uses the PostScript font model, which
722 * means that glyphs are identified by PS strings and hence font
723 * encoding can be managed independently of the supplied encoding
724 * of the font. However, in the document outline, the PDF spec
725 * encodes in either PDFDocEncoding (a custom superset of
726 * ISO-8859-1) or UTF-16BE.
728 static char *pdf_outline_convert(wchar_t *s
, int *len
) {
731 ret
= utoa_careful_dup(s
, CS_PDF
);
734 * Very silly special case: if the returned string begins with
735 * FE FF, then the PDF reader will mistake it for a UTF-16BE
736 * string. So in this case we give up on PDFDocEncoding and
737 * encode it in UTF-16 straight away.
739 if (ret
&& ret
[0] == '\xFE' && ret
[1] == '\xFF') {
745 ret
= utoa_dup_len(s
, CS_UTF16BE
, len
);
753 static int make_outline(object
*parent
, outline_element
*items
, int n
,
756 int level
, totalcount
= 0;
757 outline_element
*itemp
;
758 object
*curr
, *prev
= NULL
, *first
= NULL
, *last
= NULL
;
762 level
= items
->level
;
769 * Here we expect to be sitting on an item at the given
770 * level. So we start by constructing an outline entry for
773 assert(items
->level
== level
);
775 title
= pdf_outline_convert(items
->pdata
->outline_title
, &titlelen
);
778 curr
= new_object(parent
->list
);
779 if (!first
) first
= curr
;
781 objtext(curr
, "<<\n/Title ");
782 pdf_string_len(objtext
, curr
, title
, titlelen
);
784 objtext(curr
, "\n/Parent ");
785 objref(curr
, parent
);
786 objtext(curr
, "\n/Dest");
787 objdest(curr
, items
->pdata
->first
->page
);
790 objtext(curr
, "/Prev ");
794 objtext(prev
, "/Next ");
796 objtext(prev
, "\n>>\n");
801 for (itemp
= items
; itemp
< items
+n
&& itemp
->level
> level
;
806 int count
= make_outline(curr
, items
, itemp
- items
, FALSE
);
811 sprintf(buf
, "/Count %d\n", count
);
818 objtext(prev
, ">>\n");
820 assert(first
&& last
);
821 objtext(parent
, "/First ");
822 objref(parent
, first
);
823 objtext(parent
, "\n/Last ");
824 objref(parent
, last
);
825 objtext(parent
, "\n");
830 static int pdf_versionid(FILE *fp
, word
*words
)
834 ret
= fprintf(fp
, "%% ");
836 for (; words
; words
= words
->next
) {
840 switch (words
->type
) {
850 type
= removeattr(words
->type
);
854 text
= utoa_dup(words
->text
, CS_ASCII
);
856 case word_WhiteSpace
:
869 ret
+= fprintf(fp
, "\n");
874 static void pdf_string_len(void (*add
)(object
*, char const *),
875 object
*o
, char const *str
, int len
)
880 for (p
= str
; len
> 0; p
++, len
--) {
882 if (*p
< ' ' || *p
> '~') {
883 sprintf(c
, "\\%03o", 0xFF & (int)*p
);
886 if (*p
== '\\' || *p
== '(' || *p
== ')')
896 static void pdf_string(void (*add
)(object
*, char const *),
897 object
*o
, char const *str
)
899 pdf_string_len(add
, o
, str
, strlen(str
));