mdw@git.distorted.org.uk Git - sgt/halibut/blob - bk_pdf.c

   1 /*
   2  * PDF backend for Halibut
   3  */
   4
   5 #include <assert.h>
   6 #include "halibut.h"
   7 #include "paper.h"
   8
/*
 * Maximum number of children a /Pages node may reference directly.
 * make_pages_node() splits any longer run of pages into this many
 * sub-ranges and recurses on each.
 */
#define TREE_BRANCH 2                  /* max branching factor in page tree */
  10
  11 paragraph *pdf_config_filename(char *filename)
  12 {
  13     return cmdline_cfg_simple("pdf-filename", filename, NULL);
  14 }
  15
typedef struct object_Tag object;
typedef struct objlist_Tag objlist;

/*
 * One indirect PDF object, built up in memory before the file is
 * written.
 */
struct object_Tag {
    objlist *list;                     /* the list this object belongs to */
    object *next;                      /* next object in that list, or NULL */
    int number;                        /* PDF object number (`N 0 obj') */
    rdstringc main, stream;            /* dictionary text; stream contents */
    int size, fileoff;                 /* length of `final'; offset in file */
    char *final;                       /* fully assembled text of the object */
};

/*
 * Singly linked list of objects, kept in allocation order.
 */
struct objlist_Tag {
    int number;                        /* number the next new object gets */
    object *head, *tail;               /* first and last objects (or NULL) */
};
  32
  33 static object *new_object(objlist *list);
  34 static void objtext(object *o, char const *text);
  35 static void objstream(object *o, char const *text);
  36 static void pdf_string(void (*add)(object *, char const *),
  37                        object *, char const *);
  38 static void objref(object *o, object *dest);
  39
  40 static void make_pages_node(object *node, object *parent, page_data *first,
  41                             page_data *last, object *resources);
  42 static int make_outline(object *parent, outline_element *start, int n,
  43                         int open);
  44 static int pdf_versionid(FILE *fp, word *words);
  45
  46 void pdf_backend(paragraph *sourceform, keywordlist *keywords,
  47                  indexdata *idx, void *vdoc) {
  48     document *doc = (document *)vdoc;
  49     int font_index;
  50     font_encoding *fe;
  51     page_data *page;
  52     int pageno;
  53     FILE *fp;
  54     char *filename;
  55     paragraph *p;
  56     objlist olist;
  57     object *o, *cat, *outlines, *pages, *resources;
  58     int fileoff;
  59
  60     IGNORE(keywords);
  61     IGNORE(idx);
  62
  63     filename = dupstr("output.pdf");
  64     for (p = sourceform; p; p = p->next) {
  65         if (p->type == para_Config && p->parent) {
  66             if (!ustricmp(p->keyword, L"pdf-filename")) {
  67                 sfree(filename);
  68                 filename = dupstr(adv(p->origkeyword));
  69             }
  70         }
  71     }
  72
  73     olist.head = olist.tail = NULL;
  74     olist.number = 1;
  75
  76     cat = new_object(&olist);
  77     outlines = new_object(&olist);
  78     pages = new_object(&olist);
  79     resources = new_object(&olist);
  80
  81     /*
  82      * The catalogue just contains references to the outlines and
  83      * pages objects.
  84      */
  85     objtext(cat, "<<\n/Type /Catalog\n/Outlines ");
  86     objref(cat, outlines);
  87     objtext(cat, "\n/Pages ");
  88     objref(cat, pages);
  89     objtext(cat, "\n/PageMode /UseOutlines\n>>\n");
  90
  91     /*
  92      * Set up the resources dictionary, which mostly means
  93      * providing all the font objects and names to call them by.
  94      */
  95     font_index = 0;
  96     objtext(resources, "<<\n/Font <<\n");
  97     for (fe = doc->fonts->head; fe; fe = fe->next) {
  98         char fname[40];
  99         int i;
 100         object *font;
 101
 102         sprintf(fname, "f%d", font_index++);
 103         fe->name = dupstr(fname);
 104
 105         font = new_object(&olist);
 106
 107         objtext(resources, "/");
 108         objtext(resources, fe->name);
 109         objtext(resources, " ");
 110         objref(resources, font);
 111         objtext(resources, "\n");
 112
 113         objtext(font, "<<\n/Type /Font\n/Subtype /Type1\n/Name /");
 114         objtext(font, fe->name);
 115         objtext(font, "\n/BaseFont /");
 116         objtext(font, fe->font->name);
 117         objtext(font, "\n/Encoding <<\n/Type /Encoding\n/Differences [");
 118
 119         for (i = 0; i < 256; i++) {
 120             char buf[20];
 121             if (!fe->vector[i])
 122                 continue;
 123             sprintf(buf, "\n%d /", i);
 124             objtext(font, buf);
 125             objtext(font, fe->vector[i] ? fe->vector[i] : ".notdef");
 126         }
 127
 128         objtext(font, "\n]\n>>\n");
 129
 130         {
 131             object *widths = new_object(&olist);
 132             objtext(font, "/FirstChar 0\n/LastChar 255\n/Widths ");
 133             objref(font, widths);
 134             objtext(font, "\n");
 135             objtext(widths, "[\n");
 136             for (i = 0; i < 256; i++) {
 137                 char buf[80];
 138                 double width;
 139                 if (fe->indices[i] < 0)
 140                     width = 0.0;
 141                 else
 142                     width = fe->font->widths[fe->indices[i]];
 143                 sprintf(buf, "%g\n", 1000.0 * width / 4096.0);
 144                 objtext(widths, buf);
 145             }
 146             objtext(widths, "]\n");
 147         }
 148
 149         objtext(font, ">>\n");
 150     }
 151     objtext(resources, ">>\n>>\n");
 152
 153     /*
 154      * Define the page objects for each page, and get each one
 155      * ready to have a `Parent' specification added to it.
 156      */
 157     for (page = doc->pages; page; page = page->next) {
 158         object *opage;
 159
 160         opage = new_object(&olist);
 161         page->spare = opage;
 162         objtext(opage, "<<\n/Type /Page\n");
 163     }
 164
 165     /*
 166      * Recursively build the page tree.
 167      */
 168     make_pages_node(pages, NULL, doc->pages, NULL, resources);
 169
 170     /*
 171      * Create and render the individual pages.
 172      */
 173     pageno = 0;
 174     for (page = doc->pages; page; page = page->next) {
 175         object *opage, *cstr;
 176         rect *r;
 177         text_fragment *frag, *frag_end;
 178         char buf[256];
 179         int x, y, lx, ly;
 180
 181         opage = (object *)page->spare;
 182         /*
 183          * At this point the page dictionary is already
 184          * half-written, with /Type and /Parent already present. We
 185          * continue from there.
 186          */
 187
 188         /*
 189          * The PDF spec says /Resources is required, but also says
 190          * that it's inheritable and may be omitted if it's present
 191          * in a Pages node. In our case it is: it's present in the
 192          * topmost /Pages node because we carefully put it there.
 193          * So we don't need a /Resources entry here.
 194          */
 195         sprintf(buf, "/MediaBox [0 0 %g %g]\n",
 196                 doc->paper_width / 4096.0, doc->paper_height / 4096.0);
 197         objtext(opage, buf);
 198
 199         /*
 200          * Now we're ready to define a content stream containing
 201          * the actual text on the page.
 202          */
 203         cstr = new_object(&olist);
 204         objtext(opage, "/Contents ");
 205         objref(opage, cstr);
 206         objtext(opage, "\n");
 207
 208         /*
 209          * Render any rectangles on the page.
 210          */
 211         for (r = page->first_rect; r; r = r->next) {
 212             char buf[512];
 213             sprintf(buf, "%g %g %g %g re f\n", r->x / 4096.0,
 214                     r->y / 4096.0, r->w / 4096.0, r->h / 4096.0);
 215             objstream(cstr, buf);
 216         }
 217
 218         objstream(cstr, "BT\n");
 219
 220         /*
 221          * PDF tracks two separate current positions: the position
 222          * given in the `line matrix' and the position given in the
 223          * `text matrix'. We must therefore track both as well.
 224          * They start off at -1 (unset).
 225          */
 226         lx = ly = -1;
 227         x = y = -1;
 228
 229         frag = page->first_text;
 230         while (frag) {
 231             /*
 232              * For compactness, I'm going to group text fragments
 233              * into subsequences that use the same font+size. So
 234              * first find the end of this subsequence.
 235              */
 236             for (frag_end = frag;
 237                  (frag_end &&
 238                   frag_end->fe == frag->fe &&
 239                   frag_end->fontsize == frag->fontsize);
 240                  frag_end = frag_end->next);
 241
 242             /*
 243              * Now select the text fragment, and prepare to display
 244              * the text.
 245              */
 246             objstream(cstr, "/");
 247             objstream(cstr, frag->fe->name);
 248             sprintf(buf, " %d Tf ", frag->fontsize);
 249             objstream(cstr, buf);
 250
 251             while (frag && frag != frag_end) {
 252                 /*
 253                  * Place the text position for the first piece of
 254                  * text.
 255                  */
 256                 if (lx < 0) {
 257                     sprintf(buf, "1 0 0 1 %g %g Tm ",
 258                             frag->x/4096.0, frag->y/4096.0);
 259                 } else {
 260                     sprintf(buf, "%g %g Td ",
 261                             (frag->x - lx)/4096.0, (frag->y - ly)/4096.0);
 262                 }
 263                 objstream(cstr, buf);
 264                 lx = x = frag->x;
 265                 ly = y = frag->y;
 266
 267                 /*
 268                  * See if we're going to use Tj (show a single
 269                  * string) or TJ (show an array of strings with
 270                  * x-spacings between them). We determine this by
 271                  * seeing if there's more than one text fragment in
 272                  * sequence with the same y-coordinate.
 273                  */
 274                 if (frag->next && frag->next != frag_end &&
 275                     frag->next->y == y) {
 276                     /*
 277                      * The TJ strategy.
 278                      */
 279                     objstream(cstr, "[");
 280                     while (frag && frag != frag_end && frag->y == y) {
 281                         if (frag->x != x) {
 282                             sprintf(buf, "%g",
 283                                     (x - frag->x) * 1000.0 /
 284                                     (4096.0 * frag->fontsize));
 285                             objstream(cstr, buf);
 286                         }
 287                         pdf_string(objstream, cstr, frag->text);
 288                         x = frag->x + frag->width;
 289                         frag = frag->next;
 290                     }
 291                     objstream(cstr, "]TJ\n");
 292                 } else
 293                 {
 294                     /*
 295                      * The Tj strategy.
 296                      */
 297                     pdf_string(objstream, cstr, frag->text);
 298                     objstream(cstr, "Tj\n");
 299                     frag = frag->next;
 300                 }
 301             }
 302         }
 303         objstream(cstr, "ET");
 304
 305         /*
 306          * Also, we want an annotation dictionary containing the
 307          * cross-references from this page.
 308          */
 309         if (page->first_xref) {
 310             xref *xr;
 311             objtext(opage, "/Annots [\n");
 312
 313             for (xr = page->first_xref; xr; xr = xr->next) {
 314                 object *annot;
 315                 char buf[256];
 316
 317                 annot = new_object(&olist);
 318                 objref(opage, annot);
 319                 objtext(opage, "\n");
 320
 321                 objtext(annot, "<<\n/Type /Annot\n/Subtype /Link\n/Rect [");
 322                 sprintf(buf, "%g %g %g %g",
 323                         xr->lx / 4096.0, xr->by / 4096.0,
 324                         xr->rx / 4096.0, xr->ty / 4096.0);
 325                 objtext(annot, buf);
 326                 objtext(annot, "]\n/Border [0 0 0]\n");
 327
 328                 if (xr->dest.type == PAGE) {
 329                     objtext(annot, "/Dest [");
 330                     objref(annot, (object *)xr->dest.page->spare);
 331                     objtext(annot, " /XYZ null null null]\n");
 332                 } else {
 333                     objtext(annot, "/A <<\n/Type /Action\n/S /URI\n/URI ");
 334                     pdf_string(objtext, annot, xr->dest.url);
 335                     objtext(annot, "\n>>\n");
 336                 }
 337
 338                 objtext(annot, ">>\n");
 339             }
 340
 341             objtext(opage, "]\n");
 342         }
 343
 344         objtext(opage, ">>\n");
 345     }
 346
 347     /*
 348      * Set up the outlines dictionary.
 349      */
 350     {
 351         int topcount;
 352         char buf[80];
 353
 354         objtext(outlines, "<<\n/Type /Outlines\n");
 355         topcount = make_outline(outlines, doc->outline_elements,
 356                                 doc->n_outline_elements, TRUE);
 357         sprintf(buf, "/Count %d\n>>\n", topcount);
 358         objtext(outlines, buf);
 359     }
 360
 361     /*
 362      * Assemble the final linear form of every object.
 363      */
 364     for (o = olist.head; o; o = o->next) {
 365         rdstringc rs = {0, 0, NULL};
 366         char text[80];
 367
 368         sprintf(text, "%d 0 obj\n", o->number);
 369         rdaddsc(&rs, text);
 370
 371         if (!o->main.text && o->stream.text) {
 372             sprintf(text, "<<\n/Length %d\n>>\n", o->stream.pos);
 373             rdaddsc(&o->main, text);
 374         }
 375
 376         assert(o->main.text);
 377         rdaddsc(&rs, o->main.text);
 378         sfree(o->main.text);
 379
 380         if (rs.text[rs.pos-1] != '\n')
 381             rdaddc(&rs, '\n');
 382
 383         if (o->stream.text) {
 384             /*
 385              * FIXME: If we ever start compressing stream data then
 386              * it will have zero bytes in it, so we'll have to be
 387              * more careful than this.
 388              */
 389             rdaddsc(&rs, "stream\n");
 390             rdaddsc(&rs, o->stream.text);
 391             rdaddsc(&rs, "\nendstream\n");
 392             sfree(o->stream.text);
 393         }
 394
 395         rdaddsc(&rs, "endobj\n");
 396
 397         o->final = rs.text;
 398         o->size = rs.pos;
 399     }
 400
 401     /*
 402      * Write out the PDF file.
 403      */
 404
 405     fp = fopen(filename, "wb");
 406     if (!fp) {
 407         error(err_cantopenw, filename);
 408         return;
 409     }
 410
 411     /*
 412      * Header. I'm going to put the version IDs in the header as
 413      * well, simply in PDF comments.
 414      */
 415     fileoff = fprintf(fp, "%%PDF-1.3\n");
 416     for (p = sourceform; p; p = p->next)
 417         if (p->type == para_VersionID)
 418             fileoff += pdf_versionid(fp, p->words);
 419
 420     /*
 421      * Body
 422      */
 423     for (o = olist.head; o; o = o->next) {
 424         o->fileoff = fileoff;
 425         fwrite(o->final, 1, o->size, fp);
 426         fileoff += o->size;
 427     }
 428
 429     /*
 430      * Cross-reference table
 431      */
 432     fprintf(fp, "xref\n");
 433     assert(olist.head->number == 1);
 434     fprintf(fp, "0 %d\n", olist.tail->number + 1);
 435     fprintf(fp, "0000000000 65535 f \n");
 436     for (o = olist.head; o; o = o->next) {
 437         char entry[40];
 438         sprintf(entry, "%010d 00000 n \n", o->fileoff);
 439         assert(strlen(entry) == 20);
 440         fputs(entry, fp);
 441     }
 442
 443     /*
 444      * Trailer
 445      */
 446     fprintf(fp, "trailer\n<<\n/Size %d\n/Root %d 0 R\n>>\n",
 447             olist.tail->number + 1, cat->number);
 448     fprintf(fp, "startxref\n%d\n%%%%EOF\n", fileoff);
 449
 450     fclose(fp);
 451
 452     sfree(filename);
 453 }
 454
 455 static object *new_object(objlist *list)
 456 {
 457     object *obj = mknew(object);
 458
 459     obj->list = list;
 460
 461     obj->main.text = NULL;
 462     obj->main.pos = obj->main.size = 0;
 463     obj->stream.text = NULL;
 464     obj->stream.pos = obj->stream.size = 0;
 465
 466     obj->number = list->number++;
 467
 468     obj->next = NULL;
 469     if (list->tail)
 470         list->tail->next = obj;
 471     else
 472         list->head = obj;
 473     list->tail = obj;
 474
 475     obj->size = 0;
 476     obj->final = NULL;
 477
 478     return obj;
 479 }
 480
 481 static void objtext(object *o, char const *text)
 482 {
 483     rdaddsc(&o->main, text);
 484 }
 485
 486 static void objstream(object *o, char const *text)
 487 {
 488     rdaddsc(&o->stream, text);
 489 }
 490
 491 static void objref(object *o, object *dest)
 492 {
 493     char buf[40];
 494     sprintf(buf, "%d 0 R", dest->number);
 495     rdaddsc(&o->main, buf);
 496 }
 497
 498 static void make_pages_node(object *node, object *parent, page_data *first,
 499                             page_data *last, object *resources)
 500 {
 501     int count;
 502     page_data *page;
 503     char buf[80];
 504
 505     objtext(node, "<<\n/Type /Pages\n");
 506     if (parent) {
 507         objtext(node, "/Parent ");
 508         objref(node, parent);
 509         objtext(node, "\n");
 510     }
 511
 512     /*
 513      * Count the pages in this stretch, to see if there are few
 514      * enough to reference directly.
 515      */
 516     count = 0;
 517     for (page = first; page; page = page->next) {
 518         count++;
 519         if (page == last)
 520             break;
 521     }
 522
 523     sprintf(buf, "/Count %d\n/Kids [\n", count);
 524     objtext(node, buf);
 525
 526     if (count > TREE_BRANCH) {
 527         int i;
 528         page_data *thisfirst, *thislast;
 529
 530         page = first;
 531
 532         for (i = 0; i < TREE_BRANCH; i++) {
 533             int number = (i+1) * count / TREE_BRANCH - i * count / TREE_BRANCH;
 534             thisfirst = page;
 535             while (number--) {
 536                 thislast = page;
 537                 page = page->next;
 538             }
 539
 540             if (thisfirst == thislast) {
 541                 objref(node, (object *)thisfirst->spare);
 542                 objtext((object *)thisfirst->spare, "/Parent ");
 543                 objref((object *)thisfirst->spare, node);
 544                 objtext((object *)thisfirst->spare, "\n");
 545             } else {
 546                 object *newnode = new_object(node->list);
 547                 make_pages_node(newnode, node, thisfirst, thislast, NULL);
 548                 objref(node, newnode);
 549             }
 550             objtext(node, "\n");
 551         }
 552
 553         assert(thislast == last || page == NULL);
 554
 555     } else {
 556         for (page = first; page; page = page->next) {
 557             objref(node, (object *)page->spare);
 558             objtext(node, "\n");
 559             objtext((object *)page->spare, "/Parent ");
 560             objref((object *)page->spare, node);
 561             objtext((object *)page->spare, "\n");
 562             if (page == last)
 563                 break;
 564         }
 565     }
 566
 567     objtext(node, "]\n");
 568
 569     if (resources) {
 570         objtext(node, "/Resources ");
 571         objref(node, resources);
 572         objtext(node, "\n");
 573     }
 574
 575     objtext(node, ">>\n");
 576 }
 577
 578 /*
 579  * In text on the page, PDF uses the PostScript font model, which
 580  * means that glyphs are identified by PS strings and hence font
 581  * encoding can be managed independently of the supplied encoding
 582  * of the font. However, in the document outline, the PDF spec
 583  * simply asks for ordinary text strings without mentioning what
 584  * character set they are supposed to be interpreted in.
 585  *
 586  * Therefore, for the moment, I'm going to assume they're US-ASCII
 587  * only. If anyone knows better, they should let me know :-/
 588  */
 589 static int pdf_convert(wchar_t *s, char **result) {
 590     int doing = (result != 0);
 591     int ok = TRUE;
 592     char *p = NULL;
 593     int plen = 0, psize = 0;
 594
 595     for (; *s; s++) {
 596         wchar_t c = *s;
 597         char outc;
 598
 599         if (c >= 32 && c <= 126) {
 600             /* Char is OK. */
 601             outc = (char)c;
 602         } else {
 603             /* Char is not OK. */
 604             ok = FALSE;
 605             outc = 0xBF;               /* approximate the good old DEC `uh?' */
 606         }
 607         if (doing) {
 608             if (plen >= psize) {
 609                 psize = plen + 256;
 610                 p = resize(p, psize);
 611             }
 612             p[plen++] = outc;
 613         }
 614     }
 615     if (doing) {
 616         p = resize(p, plen+1);
 617         p[plen] = '\0';
 618         *result = p;
 619     }
 620     return ok;
 621 }
 622
 623 static int make_outline(object *parent, outline_element *items, int n,
 624                         int open)
 625 {
 626     int level, totalcount = 0;
 627     outline_element *itemp;
 628     object *curr, *prev = NULL, *first = NULL, *last = NULL;
 629
 630     assert(n > 0);
 631
 632     level = items->level;
 633
 634     while (n > 0) {
 635         char *title;
 636
 637         /*
 638          * Here we expect to be sitting on an item at the given
 639          * level. So we start by constructing an outline entry for
 640          * that item.
 641          */
 642         assert(items->level == level);
 643
 644         pdf_convert(items->pdata->outline_title, &title);
 645
 646         totalcount++;
 647         curr = new_object(parent->list);
 648         if (!first) first = curr;
 649         last = curr;
 650         objtext(curr, "<<\n/Title ");
 651         pdf_string(objtext, curr, title);
 652         objtext(curr, "\n/Parent ");
 653         objref(curr, parent);
 654         objtext(curr, "\n/Dest [");
 655         objref(curr, (object *)items->pdata->first->page->spare);
 656         objtext(curr, " /XYZ null null null]\n");
 657         if (prev) {
 658             objtext(curr, "/Prev ");
 659             objref(curr, prev);
 660             objtext(curr, "\n");
 661
 662             objtext(prev, "/Next ");
 663             objref(prev, curr);
 664             objtext(prev, "\n>>\n");
 665         }
 666         prev = curr;
 667
 668         items++, n--;
 669         for (itemp = items; itemp < items+n && itemp->level > level;
 670              itemp++);
 671
 672         if (itemp > items) {
 673             char buf[80];
 674             int count = make_outline(curr, items, itemp - items, FALSE);
 675             if (!open)
 676                 count = -count;
 677             else
 678                 totalcount += count;
 679             sprintf(buf, "/Count %d\n", count);
 680             objtext(curr, buf);
 681         }
 682
 683         n -= itemp - items;
 684         items = itemp;
 685     }
 686     objtext(prev, ">>\n");
 687
 688     assert(first && last);
 689     objtext(parent, "/First ");
 690     objref(parent, first);
 691     objtext(parent, "\n/Last ");
 692     objref(parent, last);
 693     objtext(parent, "\n");
 694
 695     return totalcount;
 696 }
 697
 698 static int pdf_versionid(FILE *fp, word *words)
 699 {
 700     int ret;
 701
 702     ret = fprintf(fp, "%% ");
 703
 704     for (; words; words = words->next) {
 705         char *text;
 706         int type;
 707
 708         switch (words->type) {
 709           case word_HyperLink:
 710           case word_HyperEnd:
 711           case word_UpperXref:
 712           case word_LowerXref:
 713           case word_XrefEnd:
 714           case word_IndexRef:
 715             continue;
 716         }
 717
 718         type = removeattr(words->type);
 719
 720         switch (type) {
 721           case word_Normal:
 722             text = utoa_dup(words->text, CS_ASCII);
 723             break;
 724           case word_WhiteSpace:
 725             text = dupstr(" ");
 726             break;
 727           case word_Quote:
 728             text = dupstr("'");
 729             break;
 730         }
 731
 732         fputs(text, fp);
 733         ret += strlen(text);
 734         sfree(text);
 735     }
 736
 737     ret += fprintf(fp, "\n");
 738
 739     return ret;
 740 }
 741
 742 static void pdf_string(void (*add)(object *, char const *),
 743                        object *o, char const *str)
 744 {
 745     char const *p;
 746
 747     add(o, "(");
 748     for (p = str; *p; p++) {
 749         char c[2];
 750         if (*p == '\\' || *p == '(' || *p == ')')
 751             add(o, "\\");
 752         c[0] = *p;
 753         c[1] = '\0';
 754         add(o, c);
 755     }
 756     add(o, ")");
 757 }