2 * Paper printing pre-backend for Halibut.
4 * This module does all the processing common to both PostScript
5 * and PDF output: selecting fonts, line wrapping and page breaking
6 * in accordance with font metrics, laying out the contents and
7 * index pages, generally doing all the page layout. After this,
8 * bk_ps.c and bk_pdf.c should only need to do linear translations
9 * into their literal output format.
15 * - Text wrapping is suspicious in both PS and PDF: the space
16 * adjust seems to be _approximately_ working, but not exactly.
17 * I bet some rounding error compensation is required.
19 * - set up contents section now we know what sections begin on
22 * - do cross-reference rectangles
26 * - all the missing features in text rendering (code paragraphs,
27 * list bullets, indentation, section heading styles)
31 * That should bring us to the same level of functionality that
32 * original-Halibut had, and the same in PDF plus the obvious
33 * interactive navigation features. After that, in future work:
35 * - linearised PDF, perhaps?
37 * - I'm uncertain of whether I need to include a ToUnicode CMap
38 * in each of my font definitions in PDF. Currently things (by
39 * which I mean cut and paste out of acroread) seem to be
40 * working fairly happily without it, but I don't know.
/*
 * Forward declarations of the file-local helpers that are defined
 * further down in this file (make_std_font, wrap_paragraph,
 * page_breaks and render_line), so that paper_pre_backend can call
 * them ahead of their definitions.
 *
 * NOTE(review): the page_breaks prototype is cut off after its
 * second parameter in this listing; the call site passes a third
 * argument (page_height). Confirm against the complete file.
 */
53 static font_data
*make_std_font(font_list
*fontlist
, char const *name
);
54 static void wrap_paragraph(para_data
*pdata
, word
*words
,
55 int w
, int i1
, int i2
);
56 static page_data
*page_breaks(line_data
*first
, line_data
*last
,
58 static void render_line(line_data
*ldata
, int left_x
, int top_y
);
/*
 * paper_pre_backend: entry point of the paper pre-backend.
 *
 * From the lines visible here, it:
 *  - sets up a font list holding three standard fonts
 *    (Times-Roman, Times-Italic, Courier) via make_std_font;
 *  - walks the sourceform paragraph list, adding or removing
 *    indent_list and indent_quote on the running indent for the
 *    paragraph types that change indentation, and for every
 *    paragraph type that needs wrapping allocates a para_data,
 *    gives it the three fonts at size 12 and calls wrap_paragraph
 *    with base_width and the first-line / later-line indents;
 *  - sets space_before and space_after on each resulting line
 *    (half the paragraph spacing on the first and last line of a
 *    paragraph, half the leading otherwise) and clears page_break;
 *  - links the lines of all wrapped paragraphs into one doubly
 *    linked list and hands it to page_breaks with page_height;
 *  - loops over the paragraphs again and calls render_line on each
 *    line, using the left margin and the top of the text area as
 *    the origin;
 *  - allocates a document and stores the font list and the paper
 *    dimensions in it.
 *
 * The keywords, idx and indent_list_bullet values are currently
 * only passed to IGNORE().
 *
 * NOTE(review): the return statement is not visible in this
 * listing; presumably the document is returned through the void *
 * result. Confirm against the complete file.
 */
60 void *paper_pre_backend(paragraph
*sourceform
, keywordlist
*keywords
,
64 int indent
, extra_indent
, firstline_indent
;
66 line_data
*ldata
, *firstline
, *lastline
;
67 font_data
*tr
, *ti
, *cr
;
72 * FIXME: All these things ought to become configurable.
/*
 * NOTE(review): every dimension below is a whole multiple of 4096,
 * which looks like a fixed-point scale of 4096 units per point -
 * confirm against the output backends.
 */
74 int paper_width
= 595 * 4096;
75 int paper_height
= 841 * 4096;
76 int left_margin
= 72 * 4096;
77 int top_margin
= 72 * 4096;
78 int right_margin
= 72 * 4096;
79 int bottom_margin
= 108 * 4096;
80 int indent_list_bullet
= 6 * 4096;
81 int indent_list
= 24 * 4096;
82 int indent_quote
= 18 * 4096;
83 int base_leading
= 4096;
84 int base_para_spacing
= 10 * 4096;
86 int base_width
= paper_width
- left_margin
- right_margin
;
87 int page_height
= paper_height
- top_margin
- bottom_margin
;
89 IGNORE(keywords
); /* FIXME */
90 IGNORE(idx
); /* FIXME */
91 IGNORE(indent_list_bullet
); /* FIXME */
94 * First, set up some font structures.
96 fontlist
= mknew(font_list
);
97 fontlist
->head
= fontlist
->tail
= NULL
;
98 tr
= make_std_font(fontlist
, "Times-Roman");
99 ti
= make_std_font(fontlist
, "Times-Italic");
100 cr
= make_std_font(fontlist
, "Courier");
103 * Go through and break up each paragraph into lines.
106 firstline
= lastline
= NULL
;
107 for (p
= sourceform
; p
; p
= p
->next
) {
108 p
->private_data
= NULL
;
112 * These paragraph types are either invisible or don't
113 * define text in the normal sense. Either way, they
114 * don't require wrapping.
120 case para_NotParaType
:
127 * These paragraph types don't require wrapping, but
128 * they do affect the line width to which we wrap the
129 * rest of the paragraphs, so we need to pay attention.
132 indent
+= indent_list
; break;
134 indent
-= indent_list
; assert(indent
>= 0); break;
136 indent
+= indent_quote
; break;
138 indent
-= indent_quote
; assert(indent
>= 0); break;
141 * This paragraph type is special. Process it
149 * All of these paragraph types require wrapping in the
150 * ordinary way. So we must supply a set of fonts, a
151 * line width and auxiliary information (e.g. bullet
152 * text) for each one.
156 case para_UnnumberedChapter
:
160 case para_BiblioCited
:
162 case para_NumberedList
:
163 case para_DescribedThing
:
164 case para_Description
:
167 pdata
= mknew(para_data
);
170 * FIXME: Subsidiary switch on paragraph type to decide
171 * what font set to use for this paragraph.
173 pdata
->fonts
[FONT_NORMAL
] = tr
;
174 pdata
->sizes
[FONT_NORMAL
] = 12;
175 pdata
->fonts
[FONT_EMPH
] = ti
;
176 pdata
->sizes
[FONT_EMPH
] = 12;
177 pdata
->fonts
[FONT_CODE
] = cr
;
178 pdata
->sizes
[FONT_CODE
] = 12;
181 * FIXME: Also select an indentation level depending on
182 * the paragraph type (list paragraphs other than
183 * para_DescribedThing need extra indent).
185 * Perhaps at some point we might even arrange for the
186 * user to be able to request indented first lines in
190 firstline_indent
= 0;
192 wrap_paragraph(pdata
, p
->words
, base_width
,
193 indent
+ firstline_indent
,
194 indent
+ extra_indent
);
197 * FIXME: Also find the auxiliary data for this
198 * paragraph. For para_Bullet it's a bullet; for
199 * para_NumberedList it's the number; for some section
200 * headings (depending on the style of section heading
201 * selected) it's the section number.
203 * Assign into pdata->first->aux_*.
206 p
->private_data
= pdata
;
209 * Set the line spacing for each line in this paragraph.
211 for (ldata
= pdata
->first
; ldata
; ldata
= ldata
->next
) {
212 if (ldata
== pdata
->first
)
213 ldata
->space_before
= base_para_spacing
/ 2;
215 ldata
->space_before
= base_leading
/ 2;
216 if (ldata
== pdata
->last
)
217 ldata
->space_after
= base_para_spacing
/ 2;
219 ldata
->space_after
= base_leading
/ 2;
220 ldata
->page_break
= FALSE
;
224 * FIXME: some kinds of section heading do require a
225 * page break before them.
232 * Link all line structures together into a big list.
234 if (p
->private_data
) {
235 pdata
= (para_data
*)p
->private_data
;
238 lastline
->next
= pdata
->first
;
239 pdata
->first
->prev
= lastline
;
241 firstline
= pdata
->first
;
242 pdata
->first
->prev
= NULL
;
244 lastline
= pdata
->last
;
250 * Now we have an enormous linked list of every line of text in
251 * the document. Break it up into pages.
253 pages
= page_breaks(firstline
, lastline
, page_height
);
256 * Now we're ready to actually lay out the pages. We do this by
257 * looping over _paragraphs_, since we may need to track cross-
258 * references between lines and even across pages.
260 for (p
= sourceform
; p
; p
= p
->next
) {
261 pdata
= (para_data
*)p
->private_data
;
264 for (ldata
= pdata
->first
; ldata
; ldata
= ldata
->next
) {
265 render_line(ldata
, left_margin
, paper_height
- top_margin
);
266 if (ldata
== pdata
->last
)
272 doc
= mknew(document
);
273 doc
->fonts
= fontlist
;
275 doc
->paper_width
= paper_width
;
276 doc
->paper_height
= paper_height
;
/*
 * new_font_encoding: allocate a fresh font_encoding (subfont) for
 * the given font and append it to the encoding list that the font
 * points at through font->list.
 *
 * The new entry is linked after the current tail if there is one,
 * and otherwise becomes the head; in both cases it becomes the
 * tail. All 256 slots are then cleared: vector[i] is set to NULL
 * and to_unicode[i] to 0xFFFF.
 *
 * NOTE(review): the return statement is not visible in this
 * listing; callers assign the result to a font_encoding pointer,
 * so presumably the new encoding is returned.
 */
280 static font_encoding
*new_font_encoding(font_data
*font
)
285 fe
= mknew(font_encoding
);
288 if (font
->list
->tail
)
289 font
->list
->tail
->next
= fe
;
291 font
->list
->head
= fe
;
292 font
->list
->tail
= fe
;
297 for (i
= 0; i
< 256; i
++) {
298 fe
->vector
[i
] = NULL
;
300 fe
->to_unicode
[i
] = 0xFFFF;
/*
 * make_std_font: build a font_data for one of the standard
 * PostScript fonts, identified by name.
 *
 * Visible steps:
 *  - fetch the width table with ps_std_font_widths(name);
 *  - count the entries of ps_std_glyphs up to its NULL terminator
 *    and use that array directly as the glyph name list;
 *  - allocate one subfont_map entry per glyph;
 *  - create the first subfont with new_font_encoding and set its
 *    free_pos to 0xA1;
 *  - for every glyph, translate its name with ps_glyph_to_unicode
 *    (asserting the result is not 0xFFFF). Glyphs whose code point
 *    lies in 0x20..0x7E are placed in the first subfont at the
 *    position equal to that code point; all others are recorded in
 *    subfont_map as not yet assigned (subfont NULL, position 0).
 *
 * NOTE(review): the body of the loop over f->bmp, the stores of
 * the list and width pointers and the return statement are not
 * visible in this listing. new_font_encoding reads font->list, so
 * that field has to be set before the call below - confirm against
 * the complete file.
 */
306 static font_data
*make_std_font(font_list
*fontlist
, char const *name
)
314 widths
= ps_std_font_widths(name
);
318 for (nglyphs
= 0; ps_std_glyphs
[nglyphs
] != NULL
; nglyphs
++);
320 f
= mknew(font_data
);
324 f
->nglyphs
= nglyphs
;
325 f
->glyphs
= ps_std_glyphs
;
327 f
->subfont_map
= mknewa(subfont_map_entry
, nglyphs
);
330 * Our first subfont will contain all of US-ASCII. This isn't
331 * really necessary - we could just create custom subfonts
332 * precisely as the whim of render_string dictated - but
333 * instinct suggests that it might be nice to have the text in
334 * the output files look _marginally_ recognisable.
336 fe
= new_font_encoding(f
);
337 fe
->free_pos
= 0xA1; /* only the top half is free */
338 f
->latest_subfont
= fe
;
340 for (i
= 0; i
< (int)lenof(f
->bmp
); i
++)
343 for (i
= 0; i
< nglyphs
; i
++) {
345 ucs
= ps_glyph_to_unicode(f
->glyphs
[i
]);
346 assert(ucs
!= 0xFFFF);
348 if (ucs
>= 0x20 && ucs
<= 0x7E) {
349 fe
->vector
[ucs
] = f
->glyphs
[i
];
350 fe
->indices
[ucs
] = i
;
351 fe
->to_unicode
[ucs
] = ucs
;
352 f
->subfont_map
[i
].subfont
= fe
;
353 f
->subfont_map
[i
].position
= ucs
;
356 * This character is not yet assigned to a subfont.
358 f
->subfont_map
[i
].subfont
= NULL
;
359 f
->subfont_map
[i
].position
= 0;
/*
 * string_width: add up the widths of the glyphs making up a wide
 * string in the given font.
 *
 * Each character is truncated to unsigned short and looked up in
 * font->bmp; an entry of 0xFFFF is handled separately from the
 * normal case, in which font->widths[index] is added to the
 * running total.
 *
 * Callers in this file multiply the result by the font size, and
 * some of them pass NULL for errs.
 *
 * NOTE(review): the handling of errs and the return statement are
 * not visible in this listing; presumably *errs is set when a
 * character has no glyph and the accumulated width is returned.
 */
366 static int string_width(font_data
*font
, wchar_t const *string
, int *errs
)
373 for (; *string
; string
++) {
376 index
= font
->bmp
[(unsigned short)*string
];
377 if (index
== 0xFFFF) {
381 width
+= font
->widths
[index
];
/*
 * Forward declaration: paper_width_list (defined next) calls
 * paper_width, and paper_width in turn calls paper_width_list for
 * alternative text, so one of the two has to be declared first.
 */
388 static int paper_width(void *vctx
, word
*word
);
/*
 * Context handed to paper_width through its void * argument.
 * The member list is not visible in this listing; the code in this
 * file reads the members minspacewidth and pdata from it.
 */
390 struct paper_width_ctx
{
/*
 * paper_width_list: total width of a run of words, obtained by
 * adding paper_width(vctx, text) to an accumulator w.
 *
 * Callers in this file pass either NULL or the last word of a line
 * as end.
 *
 * NOTE(review): the loop that advances text, the test against end
 * and the return statement are not visible in this listing.
 */
395 static int paper_width_list(void *vctx
, word
*text
, word
*end
) {
398 w
+= paper_width(vctx
, text
);
/*
 * paper_width: width of a single word; wrap_paragraph passes this
 * function to wrap_para together with a struct paper_width_ctx.
 *
 * The word type is split into a style (towordstyle) and a base
 * type (removeattr), and the style selects which of the paragraph
 * fonts is used. For white space in a font other than the code
 * font the minimum space width from the context is returned.
 * Quote words are measured as U+2018 (opening) or U+2019
 * (closing). The chosen string is measured with string_width; if
 * that reports errors and the word has alternative text, the
 * width of the alternative list is returned instead. Otherwise
 * the result is the font size times the measured width.
 *
 * NOTE(review): the cases of the switch on word->type and the
 * assignments to str for normal words and code-font spaces are not
 * visible in this listing.
 */
406 static int paper_width(void *vctx
, word
*word
)
408 struct paper_width_ctx
*ctx
= (struct paper_width_ctx
*)vctx
;
409 int style
, type
, findex
, width
, errs
;
412 switch (word
->type
) {
422 style
= towordstyle(word
->type
);
423 type
= removeattr(word
->type
);
425 findex
= (style
== word_Normal ? FONT_NORMAL
:
426 style
== word_Emph ? FONT_EMPH
:
429 if (type
== word_Normal
) {
431 } else if (type
== word_WhiteSpace
) {
432 if (findex
!= FONT_CODE
)
433 return ctx
->minspacewidth
;
436 } else /* if (type == word_Quote) */ {
437 if (word
->aux
== quote_Open
)
438 str
= L
"\x2018"; /* FIXME: configurability! */
440 str
= L
"\x2019"; /* FIXME: configurability! */
443 width
= string_width(ctx
->pdata
->fonts
[findex
], str
, &errs
);
445 if (errs
&& word
->alt
)
446 return paper_width_list(vctx
, word
->alt
, NULL
);
448 return ctx
->pdata
->sizes
[findex
] * width
;
/*
 * wrap_paragraph: wrap the words of one paragraph and store the
 * result in pdata as a doubly linked list of line_data.
 *
 * w is the full line width, i1 the indent of the first line and i2
 * the indent of every later line; wrap_para is therefore called
 * with the widths w - i1 and w - i2.
 *
 * Visible steps:
 *  - line_height is raised to the largest entry of pdata->sizes;
 *  - the natural space width is the normal font size times the
 *    width of a space in the normal font, with a fallback of half
 *    the point size (times 4096) when that is zero;
 *  - the minimum space width given to the width callback is 3/5
 *    of the natural one;
 *  - one line_data is created per wrapped line, recording the
 *    first and last word, the line height and the x position (i1
 *    for the first line, i2 otherwise);
 *  - space_adjust is computed as (wid - len) / spaces, shifted
 *    from the minimum to the natural space width, and forced to
 *    zero if it would stretch the last line of the paragraph;
 *  - aux_text and aux_left_indent are cleared.
 *
 * NOTE(review): the code that counts spaces and any guard around
 * the division by spaces are not visible in this listing; confirm
 * that a line without spaces cannot reach the division. The
 * printf calls look like debugging output - confirm whether they
 * are compiled in.
 */
451 static void wrap_paragraph(para_data
*pdata
, word
*words
,
452 int w
, int i1
, int i2
)
454 wrappedline
*wrapping
, *p
;
456 struct paper_width_ctx ctx
;
460 * We're going to need to store the line height in every line
461 * structure we generate.
466 for (i
= 0; i
< NFONTS
; i
++)
467 if (line_height
< pdata
->sizes
[i
])
468 line_height
= pdata
->sizes
[i
];
472 spacewidth
= (pdata
->sizes
[FONT_NORMAL
] *
473 string_width(pdata
->fonts
[FONT_NORMAL
], L
" ", NULL
));
474 if (spacewidth
== 0) {
476 * A font without a space?! Disturbing. I hope this never
477 * comes up, but I'll make a random guess anyway and set my
478 * space width to half the point size.
480 spacewidth
= pdata
->sizes
[FONT_NORMAL
] * 4096 / 2;
484 * I'm going to set the _minimum_ space width to 3/5 of the
485 * standard one, and use the standard one as the optimum.
487 ctx
.minspacewidth
= spacewidth
* 3 / 5;
490 wrapping
= wrap_para(words
, w
- i1
, w
- i2
, paper_width
, &ctx
, spacewidth
);
493 * Having done the wrapping, we now concoct a set of line_data
496 pdata
->first
= pdata
->last
= NULL
;
498 for (p
= wrapping
; p
; p
= p
->next
) {
501 int len
, wid
, spaces
;
503 ldata
= mknew(line_data
);
505 ldata
->pdata
= pdata
;
506 ldata
->first
= p
->begin
;
507 ldata
->last
= p
->end
;
508 ldata
->line_height
= line_height
;
510 ldata
->xpos
= (p
== wrapping ? i1
: i2
);
513 pdata
->last
->next
= ldata
;
514 ldata
->prev
= pdata
->last
;
516 pdata
->first
= ldata
;
522 len
= paper_width_list(&ctx
, ldata
->first
, ldata
->last
);
523 wid
= (p
== wrapping ? w
- i1
: w
- i2
);
538 if (removeattr(wd
->type
) == word_Normal
)
539 printf("%ls", wd
->text
);
540 else if (removeattr(wd
->type
) == word_WhiteSpace
)
542 else if (removeattr(wd
->type
) == word_Quote
)
543 printf(wd
->aux
== quote_Open ?
"`" : "'");
547 if (removeattr(wd
->type
) == word_WhiteSpace
)
549 if (wd
== ldata
->last
)
555 ldata
->space_adjust
= (wid
- len
) / spaces
;
557 * This tells us how much the space width needs to
558 * change from _min_spacewidth. But we want to store
559 * its difference from the _natural_ space width, to
560 * make the text rendering easier.
562 ldata
->space_adjust
+= ctx
.minspacewidth
;
563 ldata
->space_adjust
-= spacewidth
;
565 * Special case: on the last line of a paragraph, we
566 * never stretch spaces.
568 if (ldata
->space_adjust
> 0 && !p
->next
)
569 ldata
->space_adjust
= 0;
571 ldata
->space_adjust
= 0;
574 ldata
->aux_text
= NULL
;
575 ldata
->aux_left_indent
= 0;
/*
 * page_breaks: split the document-wide list of lines into pages.
 *
 * Backward pass: starting from the last line and following prev
 * links, each line l is treated as a possible first line of a
 * page. The inner loop extends the page forwards one line at a
 * time, stopping at a mandatory page break or once the accumulated
 * text height plus inter-line space exceeds page_height. Unless
 * the page ends at a mandatory break or at the end of the document,
 * a cost derived from the unused space is charged and the stored
 * bestcost of the line after the page is added. The cheapest
 * candidate is remembered on l, together with the shortfall
 * between page_height and the height actually used.
 *
 * Forward pass: page_data records are created from the remembered
 * choices (first_line, and last_line taken from page_last), their
 * text fragment lists are cleared, and every line on the page gets
 * a ypos made of the accumulated text and space plus a share of
 * the shortfall proportional to the space accumulated so far.
 *
 * NOTE(review): the ypos expression divides by
 * page->first_line->space after converting to float, and no guard
 * against a zero value is visible in this listing; confirm that a
 * page consisting of a single line cannot produce a zero divisor.
 * The remainder of the parameter list, the initialisation of
 * bestcost and the return statement are also not visible here.
 */
580 static page_data
*page_breaks(line_data
*first
, line_data
*last
,
587 * Page breaking is done by a close analogue of the optimal
588 * paragraph wrapping algorithm used by wrap_para(). We work
589 * backwards from the end of the document line by line; for
590 * each line, we contemplate every possible number of lines we
591 * could put on a page starting with that line, determine a
592 * cost function for each one, add it to the pre-computed cost
593 * function for optimally page-breaking everything after that
594 * page, and pick the best option.
596 * Since my line_data structures are only used for this
597 * purpose, I might as well just store the algorithm data
601 for (l
= last
; l
; l
= l
->prev
) {
602 int minheight
, text
= 0, space
= 0;
606 for (m
= l
; m
; m
= m
->next
) {
607 if (m
!= l
&& m
->page_break
)
608 break; /* we've gone as far as we can */
611 space
+= m
->prev
->space_after
;
612 if (m
!= l
|| m
->page_break
)
613 space
+= m
->space_before
;
614 text
+= m
->line_height
;
615 minheight
= text
+ space
;
617 if (m
!= l
&& minheight
> page_height
)
621 * Compute the cost of this arrangement, as the square
622 * of the amount of wasted space on the page.
623 * Exception: if this is the last page before a
624 * mandatory break or the document end, we don't
625 * penalise a large blank area.
627 if (m
->next
&& !m
->next
->page_break
)
629 int x
= page_height
- minheight
;
636 cost
+= (x
* xf
) >> 8;
641 * FIXME: here I should introduce penalties for
642 * breaking in mid-paragraph, particularly very close
643 * to one end of a paragraph and particularly in code
647 if (m
->next
&& !m
->next
->page_break
)
648 cost
+= m
->next
->bestcost
;
650 if (l
->bestcost
== -1 || l
->bestcost
> cost
) {
652 * This is the best option yet for this starting
656 if (m
->next
&& !m
->next
->page_break
)
657 l
->shortfall
= page_height
- minheight
;
668 * Now go through the line list forwards and assemble the
678 page
= mknew(page_data
);
687 page
->first_line
= l
;
688 page
->last_line
= l
->page_last
;
690 page
->first_text
= page
->last_text
= NULL
;
693 * Now assign a y-coordinate to each line on the page.
696 for (l
= page
->first_line
; l
; l
= l
->next
) {
697 if (l
!= page
->first_line
)
698 space
+= l
->prev
->space_after
;
699 if (l
!= page
->first_line
|| l
->page_break
)
700 space
+= l
->space_before
;
701 text
+= l
->line_height
;
704 l
->ypos
= text
+ space
+
705 space
* (float)page
->first_line
->shortfall
/
706 page
->first_line
->space
;
708 if (l
== page
->last_line
)
712 l
= page
->last_line
->next
;
/*
 * add_string_to_page: append one text fragment to a page.
 *
 * A text_fragment is allocated and linked at the end of the list
 * of the page (after last_text if the list is non-empty, otherwise
 * as first_text) and becomes the new last_text. The fragment keeps
 * the font size and its own copy of text made with dupstr, so the
 * caller may reuse or free its buffer afterwards.
 *
 * NOTE(review): the stores of x, y and fe into the fragment are
 * not visible in this listing.
 */
718 static void add_string_to_page(page_data
*page
, int x
, int y
,
719 font_encoding
*fe
, int size
, char *text
)
723 frag
= mknew(text_fragment
);
727 page
->last_text
->next
= frag
;
729 page
->first_text
= frag
;
730 page
->last_text
= frag
;
735 frag
->fontsize
= size
;
736 frag
->text
= dupstr(text
);
740 * Returns the updated x coordinate.
/*
 * render_string: place a wide string on a page at (x, y) in the
 * given font and size, emitting one text fragment per run of
 * characters that share a subfont.
 *
 * A byte buffer of 1 + ustrlen(str) chars is allocated. For each
 * character the glyph index is looked up in font->bmp and its
 * subfont in font->subfont_map. A glyph that has no subfont yet
 * is given the next free position in font->latest_subfont; a new
 * encoding is started once free_pos has reached 0x100, and
 * free_pos jumps from 0x7F to 0xA1. Whenever the subfont differs
 * from that of the pending run, the pending bytes are terminated
 * and passed to add_string_to_page. The position of the glyph in
 * its subfont is then appended to the buffer and the glyph width
 * times fontsize is added to textwid. A final flush follows the
 * loop.
 *
 * NOTE(review): the loop header, the advance of x by textwid, the
 * release of the buffer and the return statement are not visible
 * in this listing.
 */
742 static int render_string(page_data
*page
, font_data
*font
, int fontsize
,
743 int x
, int y
, wchar_t *str
)
746 int textpos
, textwid
, glyph
;
747 font_encoding
*subfont
= NULL
, *sf
;
749 text
= mknewa(char, 1 + ustrlen(str
));
750 textpos
= textwid
= 0;
/*
 * NOTE(review): string_width indexes font->bmp with the character
 * cast to unsigned short, but this lookup uses *str directly; a
 * wchar_t value outside the table range would index past the
 * table. Confirm whether the cast should be applied here as well.
 */
753 glyph
= font
->bmp
[*str
];
756 continue; /* nothing more we can do here */
759 * Find which subfont this character is going in.
761 sf
= font
->subfont_map
[glyph
].subfont
;
767 * This character is not yet in a subfont. Assign one.
769 if (font
->latest_subfont
->free_pos
>= 0x100)
770 font
->latest_subfont
= new_font_encoding(font
);
772 c
= font
->latest_subfont
->free_pos
++;
773 if (font
->latest_subfont
->free_pos
== 0x7F)
774 font
->latest_subfont
->free_pos
= 0xA1;
776 font
->subfont_map
[glyph
].subfont
= font
->latest_subfont
;
777 font
->subfont_map
[glyph
].position
= c
;
778 font
->latest_subfont
->vector
[c
] = font
->glyphs
[glyph
];
779 font
->latest_subfont
->indices
[c
] = glyph
;
780 font
->latest_subfont
->to_unicode
[c
] = *str
;
782 sf
= font
->latest_subfont
;
785 if (!subfont
|| sf
!= subfont
) {
787 text
[textpos
] = '\0';
788 add_string_to_page(page
, x
, y
, subfont
, fontsize
, text
);
791 assert(textpos
== 0);
797 text
[textpos
++] = font
->subfont_map
[glyph
].position
;
798 textwid
+= font
->widths
[glyph
] * fontsize
;
804 text
[textpos
] = '\0';
805 add_string_to_page(page
, x
, y
, subfont
, fontsize
, text
);
813 * Returns the updated x coordinate.
/*
 * render_text: render a run of words, from text up to text_end,
 * onto a page starting at (x, y).
 *
 * The font for each word is chosen from its style in the same way
 * as in paper_width. White space advances x by the font size
 * times the width of a space in that font. Quote words become
 * U+2018 or U+2019. For other text, string_width is called only
 * to find out whether the font lacks any of the characters: if so
 * and the word has alternative text, that alternative list is
 * rendered recursively (with a NULL end marker); otherwise the
 * string is passed to render_string, which yields the new x.
 * The walk stops once the word equal to text_end has been handled.
 *
 * NOTE(review): the loop header, the cases of the switch on
 * text->type, the place where space_adjust is added to x and the
 * return statement are not visible in this listing.
 */
815 static int render_text(page_data
*page
, para_data
*pdata
, int x
, int y
,
816 word
*text
, word
*text_end
, int space_adjust
)
819 int style
, type
, findex
, errs
;
822 switch (text
->type
) {
831 * FIXME: we should do something with all of these!
832 * Hyperlinks and xrefs have meaning in PDF, and this
833 * is probably the right place to nail down the index
838 style
= towordstyle(text
->type
);
839 type
= removeattr(text
->type
);
841 findex
= (style
== word_Normal ? FONT_NORMAL
:
842 style
== word_Emph ? FONT_EMPH
:
845 if (type
== word_Normal
) {
847 } else if (type
== word_WhiteSpace
) {
848 x
+= pdata
->sizes
[findex
] *
849 string_width(pdata
->fonts
[findex
], L
" ", NULL
);
852 } else /* if (type == word_Quote) */ {
853 if (text
->aux
== quote_Open
)
854 str
= L
"\x2018"; /* FIXME: configurability! */
856 str
= L
"\x2019"; /* FIXME: configurability! */
859 (void) string_width(pdata
->fonts
[findex
], str
, &errs
);
861 if (errs
&& text
->alt
)
862 x
= render_text(page
, pdata
, x
, y
, text
->alt
, NULL
, space_adjust
);
864 x
= render_string(page
, pdata
->fonts
[findex
],
865 pdata
->sizes
[findex
], x
, y
, str
);
868 if (text
== text_end
)
/*
 * render_line: render one wrapped line onto its page.
 *
 * left_x and top_y give the origin of the text area; the vertical
 * position of the line is top_y minus its ypos. Two runs are
 * rendered through render_text: first the auxiliary text of the
 * line at left_x + aux_left_indent with no space adjustment, then
 * the words from first to last at left_x + xpos using the
 * space_adjust stored on the line.
 *
 * NOTE(review): wrap_paragraph sets aux_text to NULL, so the first
 * call receives a NULL word list unless something else fills it
 * in; any guard around that call is not visible in this listing.
 */
876 static void render_line(line_data
*ldata
, int left_x
, int top_y
)
879 render_text(ldata
->page
, ldata
->pdata
, left_x
+ ldata
->aux_left_indent
,
880 top_y
- ldata
->ypos
, ldata
->aux_text
, NULL
, 0);
881 render_text(ldata
->page
, ldata
->pdata
, left_x
+ ldata
->xpos
,
882 top_y
- ldata
->ypos
, ldata
->first
, ldata
->last
,
883 ldata
->space_adjust
);