mdw@git.distorted.org.uk Git - sgt/halibut/blob - bk_pdf.c

   1 /*
   2  * PDF backend for Halibut
   3  */
   4
   5 #include <assert.h>
   6 #include "halibut.h"
   7 #include "paper.h"
   8
   9 #define TREE_BRANCH 2                  /* max branching factor in page tree */
  10
  11 paragraph *pdf_config_filename(char *filename)
  12 {
  13     paragraph *p;
  14     wchar_t *ufilename, *up;
  15     int len;
  16
  17     p = mknew(paragraph);
  18     memset(p, 0, sizeof(*p));
  19     p->type = para_Config;
  20     p->next = NULL;
  21     p->fpos.filename = "<command line>";
  22     p->fpos.line = p->fpos.col = -1;
  23
  24     ufilename = ufroma_dup(filename);
  25     len = ustrlen(ufilename) + 2 + lenof(L"pdf-filename");
  26     p->keyword = mknewa(wchar_t, len);
  27     up = p->keyword;
  28     ustrcpy(up, L"pdf-filename");
  29     up = uadv(up);
  30     ustrcpy(up, ufilename);
  31     up = uadv(up);
  32     *up = L'\0';
  33     assert(up - p->keyword < len);
  34     sfree(ufilename);
  35
  36     return p;
  37 }
  38
  39 typedef struct object_Tag object;
  40 typedef struct objlist_Tag objlist;
  41
  42 struct object_Tag {
  43     objlist *list;
  44     object *next;
  45     int number;
  46     rdstringc main, stream;
  47     int size, fileoff;
  48     char *final;
  49 };
  50
  51 struct objlist_Tag {
  52     int number;
  53     object *head, *tail;
  54 };
  55
  56 static object *new_object(objlist *list);
  57 static void objtext(object *o, char const *text);
  58 static void objstream(object *o, char const *text);
  59 static void pdf_string(void (*add)(object *, char const *),
  60                        object *, char const *);
  61 static void objref(object *o, object *dest);
  62
  63 static void make_pages_node(object *node, object *parent, page_data *first,
  64                             page_data *last, object *resources);
  65 static int make_outline(object *parent, outline_element *start, int n,
  66                         int open);
  67 static int pdf_versionid(FILE *fp, word *words);
  68
  69 void pdf_backend(paragraph *sourceform, keywordlist *keywords,
  70                  indexdata *idx, void *vdoc) {
  71     document *doc = (document *)vdoc;
  72     int font_index;
  73     font_encoding *fe;
  74     page_data *page;
  75     int pageno;
  76     FILE *fp;
  77     char *filename;
  78     paragraph *p;
  79     objlist olist;
  80     object *o, *cat, *outlines, *pages, *resources;
  81     int fileoff;
  82
  83     IGNORE(keywords);
  84     IGNORE(idx);
  85
  86     filename = dupstr("output.pdf");
  87     for (p = sourceform; p; p = p->next) {
  88         if (p->type == para_Config && p->parent) {
  89             if (!ustricmp(p->keyword, L"pdf-filename")) {
  90                 sfree(filename);
  91                 filename = utoa_dup(uadv(p->keyword));
  92             }
  93         }
  94     }
  95
  96     olist.head = olist.tail = NULL;
  97     olist.number = 1;
  98
  99     cat = new_object(&olist);
 100     outlines = new_object(&olist);
 101     pages = new_object(&olist);
 102     resources = new_object(&olist);
 103
 104     /*
 105      * The catalogue just contains references to the outlines and
 106      * pages objects.
 107      */
 108     objtext(cat, "<<\n/Type /Catalog\n/Outlines ");
 109     objref(cat, outlines);
 110     objtext(cat, "\n/Pages ");
 111     objref(cat, pages);
 112     objtext(cat, "\n/PageMode /UseOutlines\n>>\n");
 113
 114     /*
 115      * Set up the resources dictionary, which mostly means
 116      * providing all the font objects and names to call them by.
 117      */
 118     font_index = 0;
 119     objtext(resources, "<<\n/Font <<\n");
 120     for (fe = doc->fonts->head; fe; fe = fe->next) {
 121         char fname[40];
 122         int i;
 123         object *font;
 124
 125         sprintf(fname, "f%d", font_index++);
 126         fe->name = dupstr(fname);
 127
 128         font = new_object(&olist);
 129
 130         objtext(resources, "/");
 131         objtext(resources, fe->name);
 132         objtext(resources, " ");
 133         objref(resources, font);
 134         objtext(resources, "\n");
 135
 136         objtext(font, "<<\n/Type /Font\n/Subtype /Type1\n/Name /");
 137         objtext(font, fe->name);
 138         objtext(font, "\n/BaseFont /");
 139         objtext(font, fe->font->name);
 140         objtext(font, "\n/Encoding <<\n/Type /Encoding\n/Differences [");
 141
 142         for (i = 0; i < 256; i++) {
 143             char buf[20];
 144             if (!fe->vector[i])
 145                 continue;
 146             sprintf(buf, "\n%d /", i);
 147             objtext(font, buf);
 148             objtext(font, fe->vector[i] ? fe->vector[i] : ".notdef");
 149         }
 150
 151         objtext(font, "\n]\n>>\n");
 152
 153         {
 154             object *widths = new_object(&olist);
 155             objtext(font, "/FirstChar 0\n/LastChar 255\n/Widths ");
 156             objref(font, widths);
 157             objtext(font, "\n");
 158             objtext(widths, "[\n");
 159             for (i = 0; i < 256; i++) {
 160                 char buf[80];
 161                 double width;
 162                 if (fe->indices[i] < 0)
 163                     width = 0.0;
 164                 else
 165                     width = fe->font->widths[fe->indices[i]];
 166                 sprintf(buf, "%g\n", 1000.0 * width / 4096.0);
 167                 objtext(widths, buf);
 168             }
 169             objtext(widths, "]\n");
 170         }
 171
 172         objtext(font, ">>\n");
 173     }
 174     objtext(resources, ">>\n>>\n");
 175
 176     /*
 177      * Define the page objects for each page, and get each one
 178      * ready to have a `Parent' specification added to it.
 179      */
 180     for (page = doc->pages; page; page = page->next) {
 181         object *opage;
 182
 183         opage = new_object(&olist);
 184         page->spare = opage;
 185         objtext(opage, "<<\n/Type /Page\n");
 186     }
 187
 188     /*
 189      * Recursively build the page tree.
 190      */
 191     make_pages_node(pages, NULL, doc->pages, NULL, resources);
 192
 193     /*
 194      * Create and render the individual pages.
 195      */
 196     pageno = 0;
 197     for (page = doc->pages; page; page = page->next) {
 198         object *opage, *cstr;
 199         rect *r;
 200         text_fragment *frag, *frag_end;
 201         char buf[256];
 202         int x, y, lx, ly;
 203
 204         opage = (object *)page->spare;
 205         /*
 206          * At this point the page dictionary is already
 207          * half-written, with /Type and /Parent already present. We
 208          * continue from there.
 209          */
 210
 211         /*
 212          * The PDF spec says /Resources is required, but also says
 213          * that it's inheritable and may be omitted if it's present
 214          * in a Pages node. In our case it is: it's present in the
 215          * topmost /Pages node because we carefully put it there.
 216          * So we don't need a /Resources entry here.
 217          */
 218         sprintf(buf, "/MediaBox [0 0 %g %g]\n",
 219                 doc->paper_width / 4096.0, doc->paper_height / 4096.0);
 220         objtext(opage, buf);
 221
 222         /*
 223          * Now we're ready to define a content stream containing
 224          * the actual text on the page.
 225          */
 226         cstr = new_object(&olist);
 227         objtext(opage, "/Contents ");
 228         objref(opage, cstr);
 229         objtext(opage, "\n");
 230
 231         /*
 232          * Render any rectangles on the page.
 233          */
 234         for (r = page->first_rect; r; r = r->next) {
 235             char buf[512];
 236             sprintf(buf, "%g %g %g %g re f\n", r->x / 4096.0,
 237                     r->y / 4096.0, r->w / 4096.0, r->h / 4096.0);
 238             objstream(cstr, buf);
 239         }
 240
 241         objstream(cstr, "BT\n");
 242
 243         /*
 244          * PDF tracks two separate current positions: the position
 245          * given in the `line matrix' and the position given in the
 246          * `text matrix'. We must therefore track both as well.
 247          * They start off at -1 (unset).
 248          */
 249         lx = ly = -1;
 250         x = y = -1;
 251
 252         frag = page->first_text;
 253         while (frag) {
 254             /*
 255              * For compactness, I'm going to group text fragments
 256              * into subsequences that use the same font+size. So
 257              * first find the end of this subsequence.
 258              */
 259             for (frag_end = frag;
 260                  (frag_end &&
 261                   frag_end->fe == frag->fe &&
 262                   frag_end->fontsize == frag->fontsize);
 263                  frag_end = frag_end->next);
 264
 265             /*
 266              * Now select the text fragment, and prepare to display
 267              * the text.
 268              */
 269             objstream(cstr, "/");
 270             objstream(cstr, frag->fe->name);
 271             sprintf(buf, " %d Tf ", frag->fontsize);
 272             objstream(cstr, buf);
 273
 274             while (frag && frag != frag_end) {
 275                 /*
 276                  * Place the text position for the first piece of
 277                  * text.
 278                  */
 279                 if (lx < 0) {
 280                     sprintf(buf, "1 0 0 1 %g %g Tm ",
 281                             frag->x/4096.0, frag->y/4096.0);
 282                 } else {
 283                     sprintf(buf, "%g %g Td ",
 284                             (frag->x - lx)/4096.0, (frag->y - ly)/4096.0);
 285                 }
 286                 objstream(cstr, buf);
 287                 lx = x = frag->x;
 288                 ly = y = frag->y;
 289
 290                 /*
 291                  * See if we're going to use Tj (show a single
 292                  * string) or TJ (show an array of strings with
 293                  * x-spacings between them). We determine this by
 294                  * seeing if there's more than one text fragment in
 295                  * sequence with the same y-coordinate.
 296                  */
 297                 if (frag->next && frag->next != frag_end &&
 298                     frag->next->y == y) {
 299                     /*
 300                      * The TJ strategy.
 301                      */
 302                     objstream(cstr, "[");
 303                     while (frag && frag != frag_end && frag->y == y) {
 304                         if (frag->x != x) {
 305                             sprintf(buf, "%g",
 306                                     (x - frag->x) * 1000.0 /
 307                                     (4096.0 * frag->fontsize));
 308                             objstream(cstr, buf);
 309                         }
 310                         pdf_string(objstream, cstr, frag->text);
 311                         x = frag->x + frag->width;
 312                         frag = frag->next;
 313                     }
 314                     objstream(cstr, "]TJ\n");
 315                 } else
 316                 {
 317                     /*
 318                      * The Tj strategy.
 319                      */
 320                     pdf_string(objstream, cstr, frag->text);
 321                     objstream(cstr, "Tj\n");
 322                     frag = frag->next;
 323                 }
 324             }
 325         }
 326         objstream(cstr, "ET");
 327
 328         /*
 329          * Also, we want an annotation dictionary containing the
 330          * cross-references from this page.
 331          */
 332         if (page->first_xref) {
 333             xref *xr;
 334             objtext(opage, "/Annots [\n");
 335
 336             for (xr = page->first_xref; xr; xr = xr->next) {
 337                 object *annot;
 338                 char buf[256];
 339
 340                 annot = new_object(&olist);
 341                 objref(opage, annot);
 342                 objtext(opage, "\n");
 343
 344                 objtext(annot, "<<\n/Type /Annot\n/Subtype /Link\n/Rect [");
 345                 sprintf(buf, "%g %g %g %g",
 346                         xr->lx / 4096.0, xr->by / 4096.0,
 347                         xr->rx / 4096.0, xr->ty / 4096.0);
 348                 objtext(annot, buf);
 349                 objtext(annot, "]\n/Border [0 0 0]\n");
 350
 351                 if (xr->dest.type == PAGE) {
 352                     objtext(annot, "/Dest [");
 353                     objref(annot, (object *)xr->dest.page->spare);
 354                     objtext(annot, " /XYZ null null null]\n");
 355                 } else {
 356                     objtext(annot, "/A <<\n/Type /Action\n/S /URI\n/URI ");
 357                     pdf_string(objtext, annot, xr->dest.url);
 358                     objtext(annot, "\n>>\n");
 359                 }
 360
 361                 objtext(annot, ">>\n");
 362             }
 363
 364             objtext(opage, "]\n");
 365         }
 366
 367         objtext(opage, ">>\n");
 368     }
 369
 370     /*
 371      * Set up the outlines dictionary.
 372      */
 373     {
 374         int topcount;
 375         char buf[80];
 376
 377         objtext(outlines, "<<\n/Type /Outlines\n");
 378         topcount = make_outline(outlines, doc->outline_elements,
 379                                 doc->n_outline_elements, TRUE);
 380         sprintf(buf, "/Count %d\n>>\n", topcount);
 381         objtext(outlines, buf);
 382     }
 383
 384     /*
 385      * Assemble the final linear form of every object.
 386      */
 387     for (o = olist.head; o; o = o->next) {
 388         rdstringc rs = {0, 0, NULL};
 389         char text[80];
 390
 391         sprintf(text, "%d 0 obj\n", o->number);
 392         rdaddsc(&rs, text);
 393
 394         if (!o->main.text && o->stream.text) {
 395             sprintf(text, "<<\n/Length %d\n>>\n", o->stream.pos);
 396             rdaddsc(&o->main, text);
 397         }
 398
 399         assert(o->main.text);
 400         rdaddsc(&rs, o->main.text);
 401         sfree(o->main.text);
 402
 403         if (rs.text[rs.pos-1] != '\n')
 404             rdaddc(&rs, '\n');
 405
 406         if (o->stream.text) {
 407             /*
 408              * FIXME: If we ever start compressing stream data then
 409              * it will have zero bytes in it, so we'll have to be
 410              * more careful than this.
 411              */
 412             rdaddsc(&rs, "stream\n");
 413             rdaddsc(&rs, o->stream.text);
 414             rdaddsc(&rs, "\nendstream\n");
 415             sfree(o->stream.text);
 416         }
 417
 418         rdaddsc(&rs, "endobj\n");
 419
 420         o->final = rs.text;
 421         o->size = rs.pos;
 422     }
 423
 424     /*
 425      * Write out the PDF file.
 426      */
 427
 428     fp = fopen(filename, "wb");
 429     if (!fp) {
 430         error(err_cantopenw, filename);
 431         return;
 432     }
 433
 434     /*
 435      * Header. I'm going to put the version IDs in the header as
 436      * well, simply in PDF comments.
 437      */
 438     fileoff = fprintf(fp, "%%PDF-1.3\n");
 439     for (p = sourceform; p; p = p->next)
 440         if (p->type == para_VersionID)
 441             fileoff += pdf_versionid(fp, p->words);
 442
 443     /*
 444      * Body
 445      */
 446     for (o = olist.head; o; o = o->next) {
 447         o->fileoff = fileoff;
 448         fwrite(o->final, 1, o->size, fp);
 449         fileoff += o->size;
 450     }
 451
 452     /*
 453      * Cross-reference table
 454      */
 455     fprintf(fp, "xref\n");
 456     assert(olist.head->number == 1);
 457     fprintf(fp, "0 %d\n", olist.tail->number + 1);
 458     fprintf(fp, "0000000000 65535 f \n");
 459     for (o = olist.head; o; o = o->next) {
 460         char entry[40];
 461         sprintf(entry, "%010d 00000 n \n", o->fileoff);
 462         assert(strlen(entry) == 20);
 463         fputs(entry, fp);
 464     }
 465
 466     /*
 467      * Trailer
 468      */
 469     fprintf(fp, "trailer\n<<\n/Size %d\n/Root %d 0 R\n>>\n",
 470             olist.tail->number + 1, cat->number);
 471     fprintf(fp, "startxref\n%d\n%%%%EOF\n", fileoff);
 472
 473     fclose(fp);
 474
 475     sfree(filename);
 476 }
 477
 478 static object *new_object(objlist *list)
 479 {
 480     object *obj = mknew(object);
 481
 482     obj->list = list;
 483
 484     obj->main.text = NULL;
 485     obj->main.pos = obj->main.size = 0;
 486     obj->stream.text = NULL;
 487     obj->stream.pos = obj->stream.size = 0;
 488
 489     obj->number = list->number++;
 490
 491     obj->next = NULL;
 492     if (list->tail)
 493         list->tail->next = obj;
 494     else
 495         list->head = obj;
 496     list->tail = obj;
 497
 498     obj->size = 0;
 499     obj->final = NULL;
 500
 501     return obj;
 502 }
 503
/*
 * Add `text' to the main (non-stream) text of object `o', by
 * handing it to rdaddsc() on o->main.
 */
static void objtext(object *o, char const *text)
{
    rdaddsc(&o->main, text);
}
 508
/*
 * Add `text' to the stream text of object `o', by handing it to
 * rdaddsc() on o->stream.
 */
static void objstream(object *o, char const *text)
{
    rdaddsc(&o->stream, text);
}
 513
 514 static void objref(object *o, object *dest)
 515 {
 516     char buf[40];
 517     sprintf(buf, "%d 0 R", dest->number);
 518     rdaddsc(&o->main, buf);
 519 }
 520
 521 static void make_pages_node(object *node, object *parent, page_data *first,
 522                             page_data *last, object *resources)
 523 {
 524     int count;
 525     page_data *page;
 526     char buf[80];
 527
 528     objtext(node, "<<\n/Type /Pages\n");
 529     if (parent) {
 530         objtext(node, "/Parent ");
 531         objref(node, parent);
 532         objtext(node, "\n");
 533     }
 534
 535     /*
 536      * Count the pages in this stretch, to see if there are few
 537      * enough to reference directly.
 538      */
 539     count = 0;
 540     for (page = first; page; page = page->next) {
 541         count++;
 542         if (page == last)
 543             break;
 544     }
 545
 546     sprintf(buf, "/Count %d\n/Kids [\n", count);
 547     objtext(node, buf);
 548
 549     if (count > TREE_BRANCH) {
 550         int i;
 551         page_data *thisfirst, *thislast;
 552
 553         page = first;
 554
 555         for (i = 0; i < TREE_BRANCH; i++) {
 556             int number = (i+1) * count / TREE_BRANCH - i * count / TREE_BRANCH;
 557             thisfirst = page;
 558             while (number--) {
 559                 thislast = page;
 560                 page = page->next;
 561             }
 562
 563             if (thisfirst == thislast) {
 564                 objref(node, (object *)thisfirst->spare);
 565                 objtext((object *)thisfirst->spare, "/Parent ");
 566                 objref((object *)thisfirst->spare, node);
 567                 objtext((object *)thisfirst->spare, "\n");
 568             } else {
 569                 object *newnode = new_object(node->list);
 570                 make_pages_node(newnode, node, thisfirst, thislast, NULL);
 571                 objref(node, newnode);
 572             }
 573             objtext(node, "\n");
 574         }
 575
 576         assert(thislast == last || page == NULL);
 577
 578     } else {
 579         for (page = first; page; page = page->next) {
 580             objref(node, (object *)page->spare);
 581             objtext(node, "\n");
 582             objtext((object *)page->spare, "/Parent ");
 583             objref((object *)page->spare, node);
 584             objtext((object *)page->spare, "\n");
 585             if (page == last)
 586                 break;
 587         }
 588     }
 589
 590     objtext(node, "]\n");
 591
 592     if (resources) {
 593         objtext(node, "/Resources ");
 594         objref(node, resources);
 595         objtext(node, "\n");
 596     }
 597
 598     objtext(node, ">>\n");
 599 }
 600
 601 /*
 602  * In text on the page, PDF uses the PostScript font model, which
 603  * means that glyphs are identified by PS strings and hence font
 604  * encoding can be managed independently of the supplied encoding
 605  * of the font. However, in the document outline, the PDF spec
 606  * simply asks for ordinary text strings without mentioning what
 607  * character set they are supposed to be interpreted in.
 608  *
 609  * Therefore, for the moment, I'm going to assume they're US-ASCII
 610  * only. If anyone knows better, they should let me know :-/
 611  */
 612 static int pdf_convert(wchar_t *s, char **result) {
 613     int doing = (result != 0);
 614     int ok = TRUE;
 615     char *p = NULL;
 616     int plen = 0, psize = 0;
 617
 618     for (; *s; s++) {
 619         wchar_t c = *s;
 620         char outc;
 621
 622         if (c >= 32 && c <= 126) {
 623             /* Char is OK. */
 624             outc = (char)c;
 625         } else {
 626             /* Char is not OK. */
 627             ok = FALSE;
 628             outc = 0xBF;               /* approximate the good old DEC `uh?' */
 629         }
 630         if (doing) {
 631             if (plen >= psize) {
 632                 psize = plen + 256;
 633                 p = resize(p, psize);
 634             }
 635             p[plen++] = outc;
 636         }
 637     }
 638     if (doing) {
 639         p = resize(p, plen+1);
 640         p[plen] = '\0';
 641         *result = p;
 642     }
 643     return ok;
 644 }
 645
 646 static int make_outline(object *parent, outline_element *items, int n,
 647                         int open)
 648 {
 649     int level, totalcount = 0;
 650     outline_element *itemp;
 651     object *curr, *prev = NULL, *first = NULL, *last = NULL;
 652
 653     assert(n > 0);
 654
 655     level = items->level;
 656
 657     while (n > 0) {
 658         char *title;
 659
 660         /*
 661          * Here we expect to be sitting on an item at the given
 662          * level. So we start by constructing an outline entry for
 663          * that item.
 664          */
 665         assert(items->level == level);
 666
 667         pdf_convert(items->pdata->outline_title, &title);
 668
 669         totalcount++;
 670         curr = new_object(parent->list);
 671         if (!first) first = curr;
 672         last = curr;
 673         objtext(curr, "<<\n/Title ");
 674         pdf_string(objtext, curr, title);
 675         objtext(curr, "\n/Parent ");
 676         objref(curr, parent);
 677         objtext(curr, "\n/Dest [");
 678         objref(curr, (object *)items->pdata->first->page->spare);
 679         objtext(curr, " /XYZ null null null]\n");
 680         if (prev) {
 681             objtext(curr, "/Prev ");
 682             objref(curr, prev);
 683             objtext(curr, "\n");
 684
 685             objtext(prev, "/Next ");
 686             objref(prev, curr);
 687             objtext(prev, "\n>>\n");
 688         }
 689         prev = curr;
 690
 691         items++, n--;
 692         for (itemp = items; itemp < items+n && itemp->level > level;
 693              itemp++);
 694
 695         if (itemp > items) {
 696             char buf[80];
 697             int count = make_outline(curr, items, itemp - items, FALSE);
 698             if (!open)
 699                 count = -count;
 700             else
 701                 totalcount += count;
 702             sprintf(buf, "/Count %d\n", count);
 703             objtext(curr, buf);
 704         }
 705
 706         n -= itemp - items;
 707         items = itemp;
 708     }
 709     objtext(prev, ">>\n");
 710
 711     assert(first && last);
 712     objtext(parent, "/First ");
 713     objref(parent, first);
 714     objtext(parent, "\n/Last ");
 715     objref(parent, last);
 716     objtext(parent, "\n");
 717
 718     return totalcount;
 719 }
 720
 721 static int pdf_versionid(FILE *fp, word *words)
 722 {
 723     int ret;
 724
 725     ret = fprintf(fp, "%% ");
 726
 727     for (; words; words = words->next) {
 728         char *text;
 729         int type;
 730
 731         switch (words->type) {
 732           case word_HyperLink:
 733           case word_HyperEnd:
 734           case word_UpperXref:
 735           case word_LowerXref:
 736           case word_XrefEnd:
 737           case word_IndexRef:
 738             continue;
 739         }
 740
 741         type = removeattr(words->type);
 742
 743         switch (type) {
 744           case word_Normal:
 745             text = utoa_dup(words->text);
 746             break;
 747           case word_WhiteSpace:
 748             text = dupstr(" ");
 749             break;
 750           case word_Quote:
 751             text = dupstr("'");
 752             break;
 753         }
 754
 755         fputs(text, fp);
 756         ret += strlen(text);
 757         sfree(text);
 758     }
 759
 760     ret += fprintf(fp, "\n");
 761
 762     return ret;
 763 }
 764
 765 static void pdf_string(void (*add)(object *, char const *),
 766                        object *o, char const *str)
 767 {
 768     char const *p;
 769
 770     add(o, "(");
 771     for (p = str; *p; p++) {
 772         char c[2];
 773         if (*p == '\\' || *p == '(' || *p == ')')
 774             add(o, "\\");
 775         c[0] = *p;
 776         c[1] = '\0';
 777         add(o, c);
 778     }
 779     add(o, ")");
 780 }