mdw@git.distorted.org.uk Git - sgt/halibut/blob - bk_pdf.c

   1 /*
   2  * PDF backend for Halibut
   3  */
   4
   5 #include <assert.h>
   6 #include "halibut.h"
   7 #include "paper.h"
   8
   9 #define TREE_BRANCH 2                  /* max branching factor in page tree */
  10
  11 paragraph *pdf_config_filename(char *filename)
  12 {
  13     paragraph *p;
  14     wchar_t *ufilename, *up;
  15     int len;
  16
  17     p = mknew(paragraph);
  18     memset(p, 0, sizeof(*p));
  19     p->type = para_Config;
  20     p->next = NULL;
  21     p->fpos.filename = "<command line>";
  22     p->fpos.line = p->fpos.col = -1;
  23
  24     ufilename = ufroma_dup(filename);
  25     len = ustrlen(ufilename) + 2 + lenof(L"pdf-filename");
  26     p->keyword = mknewa(wchar_t, len);
  27     up = p->keyword;
  28     ustrcpy(up, L"pdf-filename");
  29     up = uadv(up);
  30     ustrcpy(up, ufilename);
  31     up = uadv(up);
  32     *up = L'\0';
  33     assert(up - p->keyword < len);
  34     sfree(ufilename);
  35
  36     return p;
  37 }
  38
  39 typedef struct object_Tag object;
  40 typedef struct objlist_Tag objlist;
  41
  42 struct object_Tag {
  43     objlist *list;
  44     object *next;
  45     int number;
  46     rdstringc main, stream;
  47     int size, fileoff;
  48     char *final;
  49 };
  50
  51 struct objlist_Tag {
  52     int number;
  53     object *head, *tail;
  54 };
  55
  56 static object *new_object(objlist *list);
  57 static void objtext(object *o, char const *text);
  58 static void objstream(object *o, char const *text);
  59 static void objref(object *o, object *dest);
  60
  61 static void make_pages_node(object *node, object *parent, page_data *first,
  62                             page_data *last, object *resources);
  63 static int make_outline(object *parent, outline_element *start, int n,
  64                         int open);
  65 static int pdf_versionid(FILE *fp, word *words);
  66
  67 void pdf_backend(paragraph *sourceform, keywordlist *keywords,
  68                  indexdata *idx, void *vdoc) {
  69     document *doc = (document *)vdoc;
  70     int font_index;
  71     font_encoding *fe;
  72     page_data *page;
  73     int pageno;
  74     FILE *fp;
  75     char *filename;
  76     paragraph *p;
  77     objlist olist;
  78     object *o, *cat, *outlines, *pages, *resources;
  79     int fileoff;
  80
  81     IGNORE(keywords);
  82     IGNORE(idx);
  83
  84     filename = dupstr("output.pdf");
  85     for (p = sourceform; p; p = p->next) {
  86         if (p->type == para_Config && p->parent) {
  87             if (!ustricmp(p->keyword, L"pdf-filename")) {
  88                 sfree(filename);
  89                 filename = utoa_dup(uadv(p->keyword));
  90             }
  91         }
  92     }
  93
  94     olist.head = olist.tail = NULL;
  95     olist.number = 1;
  96
  97     cat = new_object(&olist);
  98     outlines = new_object(&olist);
  99     pages = new_object(&olist);
 100     resources = new_object(&olist);
 101
 102     /*
 103      * The catalogue just contains references to the outlines and
 104      * pages objects.
 105      */
 106     objtext(cat, "<<\n/Type /Catalog\n/Outlines ");
 107     objref(cat, outlines);
 108     objtext(cat, "\n/Pages ");
 109     objref(cat, pages);
 110     objtext(cat, "\n/PageMode /UseOutlines\n>>\n");
 111
 112     /*
 113      * Set up the resources dictionary, which mostly means
 114      * providing all the font objects and names to call them by.
 115      */
 116     font_index = 0;
 117     objtext(resources, "<<\n/Font <<\n");
 118     for (fe = doc->fonts->head; fe; fe = fe->next) {
 119         char fname[40];
 120         int i;
 121         object *font;
 122
 123         sprintf(fname, "f%d", font_index++);
 124         fe->name = dupstr(fname);
 125
 126         font = new_object(&olist);
 127
 128         objtext(resources, "/");
 129         objtext(resources, fe->name);
 130         objtext(resources, " ");
 131         objref(resources, font);
 132         objtext(resources, "\n");
 133
 134         objtext(font, "<<\n/Type /Font\n/Subtype /Type1\n/Name /");
 135         objtext(font, fe->name);
 136         objtext(font, "\n/BaseFont /");
 137         objtext(font, fe->font->name);
 138         objtext(font, "\n/Encoding <<\n/Type /Encoding\n/Differences [");
 139
 140         for (i = 0; i < 256; i++) {
 141             char buf[20];
 142             if (!fe->vector[i])
 143                 continue;
 144             sprintf(buf, "\n%d /", i);
 145             objtext(font, buf);
 146             objtext(font, fe->vector[i] ? fe->vector[i] : ".notdef");
 147         }
 148
 149         objtext(font, "\n]\n>>\n");
 150
 151         {
 152             object *widths = new_object(&olist);
 153             objtext(font, "/FirstChar 0\n/LastChar 255\n/Widths ");
 154             objref(font, widths);
 155             objtext(font, "\n");
 156             objtext(widths, "[\n");
 157             for (i = 0; i < 256; i++) {
 158                 char buf[80];
 159                 double width;
 160                 if (fe->indices[i] < 0)
 161                     width = 0.0;
 162                 else
 163                     width = fe->font->widths[fe->indices[i]];
 164                 sprintf(buf, "%g\n", 1000.0 * width / 4096.0);
 165                 objtext(widths, buf);
 166             }
 167             objtext(widths, "]\n");
 168         }
 169
 170         objtext(font, ">>\n");
 171     }
 172     objtext(resources, ">>\n>>\n");
 173
 174     /*
 175      * Define the page objects for each page, and get each one
 176      * ready to have a `Parent' specification added to it.
 177      */
 178     for (page = doc->pages; page; page = page->next) {
 179         object *opage;
 180
 181         opage = new_object(&olist);
 182         page->spare = opage;
 183         objtext(opage, "<<\n/Type /Page\n");
 184     }
 185
 186     /*
 187      * Recursively build the page tree.
 188      */
 189     make_pages_node(pages, NULL, doc->pages, NULL, resources);
 190
 191     /*
 192      * Create and render the individual pages.
 193      */
 194     pageno = 0;
 195     for (page = doc->pages; page; page = page->next) {
 196         object *opage, *cstr;
 197         rect *r;
 198         text_fragment *frag;
 199         char buf[256];
 200
 201         opage = (object *)page->spare;
 202         /*
 203          * At this point the page dictionary is already
 204          * half-written, with /Type and /Parent already present. We
 205          * continue from there.
 206          */
 207
 208         /*
 209          * The PDF spec says /Resources is required, but also says
 210          * that it's inheritable and may be omitted if it's present
 211          * in a Pages node. In our case it is: it's present in the
 212          * topmost /Pages node because we carefully put it there.
 213          * So we don't need a /Resources entry here.
 214          */
 215         sprintf(buf, "/MediaBox [0 0 %g %g]\n",
 216                 doc->paper_width / 4096.0, doc->paper_height / 4096.0);
 217         objtext(opage, buf);
 218
 219         /*
 220          * Now we're ready to define a content stream containing
 221          * the actual text on the page.
 222          */
 223         cstr = new_object(&olist);
 224         objtext(opage, "/Contents ");
 225         objref(opage, cstr);
 226         objtext(opage, "\n");
 227
 228         /*
 229          * Render any rectangles on the page.
 230          */
 231         for (r = page->first_rect; r; r = r->next) {
 232             char buf[512];
 233             sprintf(buf, "%g %g %g %g re f\n", r->x / 4096.0,
 234                     r->y / 4096.0, r->w / 4096.0, r->h / 4096.0);
 235             objstream(cstr, buf);
 236         }
 237
 238         objstream(cstr, "BT\n");
 239         for (frag = page->first_text; frag; frag = frag->next) {
 240             char *c;
 241
 242             objstream(cstr, "/");
 243             objstream(cstr, frag->fe->name);
 244             sprintf(buf, " %d Tf 1 0 0 1 %g %g Tm (", frag->fontsize,
 245                     frag->x/4096.0, frag->y/4096.0);
 246             objstream(cstr, buf);
 247
 248             for (c = frag->text; *c; c++) {
 249                 if (*c == '(' || *c == ')' || *c == '\\')
 250                     objstream(cstr, "\\");
 251                 buf[0] = *c;
 252                 buf[1] = '\0';
 253                 objstream(cstr, buf);
 254             }
 255
 256             objstream(cstr, ") Tj\n");
 257         }
 258         objstream(cstr, "ET");
 259
 260         /*
 261          * Also, we want an annotation dictionary containing the
 262          * cross-references from this page.
 263          */
 264         if (page->first_xref) {
 265             xref *xr;
 266             objtext(opage, "/Annots [\n");
 267
 268             for (xr = page->first_xref; xr; xr = xr->next) {
 269                 object *annot;
 270                 char buf[256];
 271
 272                 annot = new_object(&olist);
 273                 objref(opage, annot);
 274                 objtext(opage, "\n");
 275
 276                 objtext(annot, "<<\n/Type /Annot\n/Subtype /Link\n/Rect [");
 277                 sprintf(buf, "%g %g %g %g",
 278                         xr->lx / 4096.0, xr->by / 4096.0,
 279                         xr->rx / 4096.0, xr->ty / 4096.0);
 280                 objtext(annot, buf);
 281                 objtext(annot, "]\n/Border [0 0 0]\n");
 282
 283                 if (xr->dest.type == PAGE) {
 284                     objtext(annot, "/Dest [");
 285                     objref(annot, (object *)xr->dest.page->spare);
 286                     objtext(annot, " /XYZ null null null]\n");
 287                 } else {
 288                     char *p;
 289
 290                     objtext(annot, "/A <<\n/Type /Action\n/S /URI\n/URI (");
 291                     for (p = xr->dest.url; *p; p++) {
 292                         char c[2];
 293                         c[0] = *p;
 294                         c[1] = '\0';
 295                         if (*p == '(' || *p == ')' || *p == '\\')
 296                             objtext(annot, "\\");
 297                         objtext(annot, c);
 298                     }
 299                     objtext(annot, ")\n>>\n");
 300                 }
 301
 302                 objtext(annot, ">>\n");
 303             }
 304
 305             objtext(opage, "]\n");
 306         }
 307
 308         objtext(opage, ">>\n");
 309     }
 310
 311     /*
 312      * Set up the outlines dictionary.
 313      */
 314     {
 315         int topcount;
 316         char buf[80];
 317
 318         objtext(outlines, "<<\n/Type /Outlines\n");
 319         topcount = make_outline(outlines, doc->outline_elements,
 320                                 doc->n_outline_elements, TRUE);
 321         sprintf(buf, "/Count %d\n>>\n", topcount);
 322         objtext(outlines, buf);
 323     }
 324
 325     /*
 326      * Assemble the final linear form of every object.
 327      */
 328     for (o = olist.head; o; o = o->next) {
 329         rdstringc rs = {0, 0, NULL};
 330         char text[80];
 331
 332         sprintf(text, "%d 0 obj\n", o->number);
 333         rdaddsc(&rs, text);
 334
 335         if (!o->main.text && o->stream.text) {
 336             sprintf(text, "<<\n/Length %d\n>>\n", o->stream.pos);
 337             rdaddsc(&o->main, text);
 338         }
 339
 340         assert(o->main.text);
 341         rdaddsc(&rs, o->main.text);
 342         sfree(o->main.text);
 343
 344         if (rs.text[rs.pos-1] != '\n')
 345             rdaddc(&rs, '\n');
 346
 347         if (o->stream.text) {
 348             /*
 349              * FIXME: If we ever start compressing stream data then
 350              * it will have zero bytes in it, so we'll have to be
 351              * more careful than this.
 352              */
 353             rdaddsc(&rs, "stream\n");
 354             rdaddsc(&rs, o->stream.text);
 355             rdaddsc(&rs, "\nendstream\n");
 356             sfree(o->stream.text);
 357         }
 358
 359         rdaddsc(&rs, "endobj\n");
 360
 361         o->final = rs.text;
 362         o->size = rs.pos;
 363     }
 364
 365     /*
 366      * Write out the PDF file.
 367      */
 368
 369     fp = fopen(filename, "wb");
 370     if (!fp) {
 371         error(err_cantopenw, filename);
 372         return;
 373     }
 374
 375     /*
 376      * Header. I'm going to put the version IDs in the header as
 377      * well, simply in PDF comments.
 378      */
 379     fileoff = fprintf(fp, "%%PDF-1.3\n");
 380     for (p = sourceform; p; p = p->next)
 381         if (p->type == para_VersionID)
 382             fileoff += pdf_versionid(fp, p->words);
 383
 384     /*
 385      * Body
 386      */
 387     for (o = olist.head; o; o = o->next) {
 388         o->fileoff = fileoff;
 389         fwrite(o->final, 1, o->size, fp);
 390         fileoff += o->size;
 391     }
 392
 393     /*
 394      * Cross-reference table
 395      */
 396     fprintf(fp, "xref\n");
 397     assert(olist.head->number == 1);
 398     fprintf(fp, "0 %d\n", olist.tail->number + 1);
 399     fprintf(fp, "0000000000 65535 f \n");
 400     for (o = olist.head; o; o = o->next) {
 401         char entry[40];
 402         sprintf(entry, "%010d 00000 n \n", o->fileoff);
 403         assert(strlen(entry) == 20);
 404         fputs(entry, fp);
 405     }
 406
 407     /*
 408      * Trailer
 409      */
 410     fprintf(fp, "trailer\n<<\n/Size %d\n/Root %d 0 R\n>>\n",
 411             olist.tail->number + 1, cat->number);
 412     fprintf(fp, "startxref\n%d\n%%%%EOF\n", fileoff);
 413
 414     fclose(fp);
 415
 416     sfree(filename);
 417 }
 418
 419 static object *new_object(objlist *list)
 420 {
 421     object *obj = mknew(object);
 422
 423     obj->list = list;
 424
 425     obj->main.text = NULL;
 426     obj->main.pos = obj->main.size = 0;
 427     obj->stream.text = NULL;
 428     obj->stream.pos = obj->stream.size = 0;
 429
 430     obj->number = list->number++;
 431
 432     obj->next = NULL;
 433     if (list->tail)
 434         list->tail->next = obj;
 435     else
 436         list->head = obj;
 437     list->tail = obj;
 438
 439     obj->size = 0;
 440     obj->final = NULL;
 441
 442     return obj;
 443 }
 444
 445 static void objtext(object *o, char const *text)
 446 {
 447     rdaddsc(&o->main, text);
 448 }
 449
 450 static void objstream(object *o, char const *text)
 451 {
 452     rdaddsc(&o->stream, text);
 453 }
 454
 455 static void objref(object *o, object *dest)
 456 {
 457     char buf[40];
 458     sprintf(buf, "%d 0 R", dest->number);
 459     rdaddsc(&o->main, buf);
 460 }
 461
 462 static void make_pages_node(object *node, object *parent, page_data *first,
 463                             page_data *last, object *resources)
 464 {
 465     int count;
 466     page_data *page;
 467     char buf[80];
 468
 469     objtext(node, "<<\n/Type /Pages\n");
 470     if (parent) {
 471         objtext(node, "/Parent ");
 472         objref(node, parent);
 473         objtext(node, "\n");
 474     }
 475
 476     /*
 477      * Count the pages in this stretch, to see if there are few
 478      * enough to reference directly.
 479      */
 480     count = 0;
 481     for (page = first; page; page = page->next) {
 482         count++;
 483         if (page == last)
 484             break;
 485     }
 486
 487     sprintf(buf, "/Count %d\n/Kids [\n", count);
 488     objtext(node, buf);
 489
 490     if (count > TREE_BRANCH) {
 491         int i;
 492         page_data *thisfirst, *thislast;
 493
 494         page = first;
 495
 496         for (i = 0; i < TREE_BRANCH; i++) {
 497             int number = (i+1) * count / TREE_BRANCH - i * count / TREE_BRANCH;
 498             thisfirst = page;
 499             while (number--) {
 500                 thislast = page;
 501                 page = page->next;
 502             }
 503
 504             if (thisfirst == thislast) {
 505                 objref(node, (object *)thisfirst->spare);
 506                 objtext((object *)thisfirst->spare, "/Parent ");
 507                 objref((object *)thisfirst->spare, node);
 508                 objtext((object *)thisfirst->spare, "\n");
 509             } else {
 510                 object *newnode = new_object(node->list);
 511                 make_pages_node(newnode, node, thisfirst, thislast, NULL);
 512                 objref(node, newnode);
 513             }
 514             objtext(node, "\n");
 515         }
 516
 517         assert(thislast == last || page == NULL);
 518
 519     } else {
 520         for (page = first; page; page = page->next) {
 521             objref(node, (object *)page->spare);
 522             objtext(node, "\n");
 523             objtext((object *)page->spare, "/Parent ");
 524             objref((object *)page->spare, node);
 525             objtext((object *)page->spare, "\n");
 526             if (page == last)
 527                 break;
 528         }
 529     }
 530
 531     objtext(node, "]\n");
 532
 533     if (resources) {
 534         objtext(node, "/Resources ");
 535         objref(node, resources);
 536         objtext(node, "\n");
 537     }
 538
 539     objtext(node, ">>\n");
 540 }
 541
 542 /*
 543  * In text on the page, PDF uses the PostScript font model, which
 544  * means that glyphs are identified by PS strings and hence font
 545  * encoding can be managed independently of the supplied encoding
 546  * of the font. However, in the document outline, the PDF spec
 547  * simply asks for ordinary text strings without mentioning what
 548  * character set they are supposed to be interpreted in.
 549  *
 550  * Therefore, for the moment, I'm going to assume they're US-ASCII
 551  * only. If anyone knows better, they should let me know :-/
 552  */
 553 static int pdf_convert(wchar_t *s, char **result) {
 554     int doing = (result != 0);
 555     int ok = TRUE;
 556     char *p = NULL;
 557     int plen = 0, psize = 0;
 558
 559     for (; *s; s++) {
 560         wchar_t c = *s;
 561         char outc;
 562
 563         if (c >= 32 && c <= 126) {
 564             /* Char is OK. */
 565             outc = (char)c;
 566         } else {
 567             /* Char is not OK. */
 568             ok = FALSE;
 569             outc = 0xBF;               /* approximate the good old DEC `uh?' */
 570         }
 571         if (doing) {
 572             if (plen >= psize) {
 573                 psize = plen + 256;
 574                 p = resize(p, psize);
 575             }
 576             p[plen++] = outc;
 577         }
 578     }
 579     if (doing) {
 580         p = resize(p, plen+1);
 581         p[plen] = '\0';
 582         *result = p;
 583     }
 584     return ok;
 585 }
 586
 587 static int make_outline(object *parent, outline_element *items, int n,
 588                         int open)
 589 {
 590     int level, totalcount = 0;
 591     outline_element *itemp;
 592     object *curr, *prev = NULL, *first = NULL, *last = NULL;
 593
 594     assert(n > 0);
 595
 596     level = items->level;
 597
 598     while (n > 0) {
 599         char *title, *p;
 600
 601         /*
 602          * Here we expect to be sitting on an item at the given
 603          * level. So we start by constructing an outline entry for
 604          * that item.
 605          */
 606         assert(items->level == level);
 607
 608         pdf_convert(items->pdata->outline_title, &title);
 609
 610         totalcount++;
 611         curr = new_object(parent->list);
 612         if (!first) first = curr;
 613         last = curr;
 614         objtext(curr, "<<\n/Title (");
 615         for (p = title; *p; p++) {
 616             char c[2];
 617             if (*p == '\\' || *p == '(' || *p == ')')
 618                 objtext(curr, "\\");
 619             c[0] = *p;
 620             c[1] = '\0';
 621             objtext(curr, c);
 622         }
 623         objtext(curr, ")\n/Parent ");
 624         objref(curr, parent);
 625         objtext(curr, "\n/Dest [");
 626         objref(curr, (object *)items->pdata->first->page->spare);
 627         objtext(curr, " /XYZ null null null]\n");
 628         if (prev) {
 629             objtext(curr, "/Prev ");
 630             objref(curr, prev);
 631             objtext(curr, "\n");
 632
 633             objtext(prev, "/Next ");
 634             objref(prev, curr);
 635             objtext(prev, "\n>>\n");
 636         }
 637         prev = curr;
 638
 639         items++, n--;
 640         for (itemp = items; itemp < items+n && itemp->level > level;
 641              itemp++);
 642
 643         if (itemp > items) {
 644             char buf[80];
 645             int count = make_outline(curr, items, itemp - items, FALSE);
 646             if (!open)
 647                 count = -count;
 648             else
 649                 totalcount += count;
 650             sprintf(buf, "/Count %d\n", count);
 651             objtext(curr, buf);
 652         }
 653
 654         n -= itemp - items;
 655         items = itemp;
 656     }
 657     objtext(prev, ">>\n");
 658
 659     assert(first && last);
 660     objtext(parent, "/First ");
 661     objref(parent, first);
 662     objtext(parent, "\n/Last ");
 663     objref(parent, last);
 664     objtext(parent, "\n");
 665
 666     return totalcount;
 667 }
 668
 669 static int pdf_versionid(FILE *fp, word *words)
 670 {
 671     int ret;
 672
 673     ret = fprintf(fp, "%% ");
 674
 675     for (; words; words = words->next) {
 676         char *text;
 677         int type;
 678
 679         switch (words->type) {
 680           case word_HyperLink:
 681           case word_HyperEnd:
 682           case word_UpperXref:
 683           case word_LowerXref:
 684           case word_XrefEnd:
 685           case word_IndexRef:
 686             continue;
 687         }
 688
 689         type = removeattr(words->type);
 690
 691         switch (type) {
 692           case word_Normal:
 693             text = utoa_dup(words->text);
 694             break;
 695           case word_WhiteSpace:
 696             text = dupstr(" ");
 697             break;
 698           case word_Quote:
 699             text = dupstr("'");
 700             break;
 701         }
 702
 703         fputs(text, fp);
 704         ret += strlen(text);
 705         sfree(text);
 706     }
 707
 708     ret += fprintf(fp, "\n");
 709
 710     return ret;
 711 }