From: simon <simon@cda61777-01e9-0310-a592-d414129be87e>
Date: Fri, 16 Apr 2004 10:56:31 +0000 (+0000)
Subject: Compacted PS and PDF output files by removing redundant reiterations
X-Git-Url: https://git.distorted.org.uk/~mdw/sgt/halibut/commitdiff_plain/7c8c4239f226d2dabdd84dabae9f5869d6d64f2f

Compacted PS and PDF output files by removing redundant reiterations
of the same font and position designations. Reduced the size of the
Halibut manual PDF to less than half what it started out as, and the
PS one to more like a third of its original size.


git-svn-id: svn://svn.tartarus.org/sgt/halibut@4083 cda61777-01e9-0310-a592-d414129be87e
---

diff --git a/bk_paper.c b/bk_paper.c
index 2e05028..dba4efd 100644
--- a/bk_paper.c
+++ b/bk_paper.c
@@ -14,14 +14,6 @@
  * 
  *  - linearised PDF, perhaps?
  * 
- *  - compression of output files. For the actual text display,
- *    both output formats currently average about 50-60 characters
- *    per 5-6 character word of text, and almost all of it's the
- *    same.
- *     * In PS, we can define custom text operators to make things
- * 	 more efficient.
- *     * In PDF, there already are!
- * 
  *  - I'm uncertain of whether I need to include a ToUnicode CMap
  *    in each of my font definitions in PDF. Currently things (by
  *    which I mean cut and paste out of acroread) seem to be
@@ -1563,7 +1555,8 @@ static void add_rect_to_page(page_data *page, int x, int y, int w, int h)
 }
 
 static void add_string_to_page(page_data *page, int x, int y,
-			       font_encoding *fe, int size, char *text)
+			       font_encoding *fe, int size, char *text,
+			       int width)
 {
     text_fragment *frag;
 
@@ -1581,6 +1574,7 @@ static void add_string_to_page(page_data *page, int x, int y,
     frag->fe = fe;
     frag->fontsize = size;
     frag->text = dupstr(text);
+    frag->width = width;
 }
 
 /*
@@ -1634,7 +1628,8 @@ static int render_string(page_data *page, font_data *font, int fontsize,
 	if (!subfont || sf != subfont) {
 	    if (subfont) {
 		text[textpos] = '\0';
-		add_string_to_page(page, x, y, subfont, fontsize, text);
+		add_string_to_page(page, x, y, subfont, fontsize, text,
+				   textwid);
 		x += textwid;
 	    } else {
 		assert(textpos == 0);
@@ -1651,7 +1646,7 @@ static int render_string(page_data *page, font_data *font, int fontsize,
 
     if (textpos > 0) {
 	text[textpos] = '\0';
-	add_string_to_page(page, x, y, subfont, fontsize, text);
+	add_string_to_page(page, x, y, subfont, fontsize, text, textwid);
 	x += textwid;
     }
 
diff --git a/bk_pdf.c b/bk_pdf.c
index fe10894..f6babe5 100644
--- a/bk_pdf.c
+++ b/bk_pdf.c
@@ -56,6 +56,8 @@ struct objlist_Tag {
 static object *new_object(objlist *list);
 static void objtext(object *o, char const *text);
 static void objstream(object *o, char const *text);
+static void pdf_string(void (*add)(object *, char const *),
+		       object *, char const *);
 static void objref(object *o, object *dest);
 
 static void make_pages_node(object *node, object *parent, page_data *first,
@@ -195,8 +197,9 @@ void pdf_backend(paragraph *sourceform, keywordlist *keywords,
     for (page = doc->pages; page; page = page->next) {
 	object *opage, *cstr;
 	rect *r;
-	text_fragment *frag;
+	text_fragment *frag, *frag_end;
 	char buf[256];
+	int x, y, lx, ly;
 
 	opage = (object *)page->spare;
 	/*
@@ -236,24 +239,89 @@ void pdf_backend(paragraph *sourceform, keywordlist *keywords,
 	}
 
 	objstream(cstr, "BT\n");
-	for (frag = page->first_text; frag; frag = frag->next) {
-	    char *c;
 
+	/*
+	 * PDF tracks two separate current positions: the position
+	 * given in the `line matrix' and the position given in the
+	 * `text matrix'. We must therefore track both as well.
+	 * They start off at -1 (unset).
+	 */
+	lx = ly = -1;
+	x = y = -1;
+
+	frag = page->first_text;
+	while (frag) {
+	    /*
+	     * For compactness, I'm going to group text fragments
+	     * into subsequences that use the same font+size. So
+	     * first find the end of this subsequence.
+	     */
+	    for (frag_end = frag;
+		 (frag_end &&
+		  frag_end->fe == frag->fe &&
+		  frag_end->fontsize == frag->fontsize);
+		 frag_end = frag_end->next);
+
+	    /*
+	     * Now select the font and size shared by this subsequence,
+	     * and prepare to display the text.
+	     */
 	    objstream(cstr, "/");
 	    objstream(cstr, frag->fe->name);
-	    sprintf(buf, " %d Tf 1 0 0 1 %g %g Tm (", frag->fontsize,
-		    frag->x/4096.0, frag->y/4096.0);
+	    sprintf(buf, " %d Tf ", frag->fontsize);
 	    objstream(cstr, buf);
 
-	    for (c = frag->text; *c; c++) {
-		if (*c == '(' || *c == ')' || *c == '\\')
-		    objstream(cstr, "\\");
-		buf[0] = *c;
-		buf[1] = '\0';
+	    while (frag && frag != frag_end) {
+		/*
+		 * Place the text position for the first piece of
+		 * text.
+		 */
+		if (lx < 0) {
+		    sprintf(buf, "1 0 0 1 %g %g Tm ",
+			    frag->x/4096.0, frag->y/4096.0);
+		} else {
+		    sprintf(buf, "%g %g Td ",
+			    (frag->x - lx)/4096.0, (frag->y - ly)/4096.0);
+		}
 		objstream(cstr, buf);
+		lx = x = frag->x;
+		ly = y = frag->y;
+
+		/*
+		 * See if we're going to use Tj (show a single
+		 * string) or TJ (show an array of strings with
+		 * x-spacings between them). We determine this by
+		 * seeing if there's more than one text fragment in
+		 * sequence with the same y-coordinate.
+		 */
+		if (frag->next && frag->next != frag_end &&
+		    frag->next->y == y) {
+		    /*
+		     * The TJ strategy.
+		     */
+		    objstream(cstr, "[");
+		    while (frag && frag != frag_end && frag->y == y) {
+			if (frag->x != x) {
+			    sprintf(buf, "%g",
+				    (x - frag->x) * 1000.0 /
+				    (4096.0 * frag->fontsize));
+			    objstream(cstr, buf);
+			}
+			pdf_string(objstream, cstr, frag->text);
+			x = frag->x + frag->width;
+			frag = frag->next;
+		    }
+		    objstream(cstr, "]TJ\n");
+		} else
+		{
+		    /*
+		     * The Tj strategy.
+		     */
+		    pdf_string(objstream, cstr, frag->text);
+		    objstream(cstr, "Tj\n");
+		    frag = frag->next;
+		}
 	    }
-
-	    objstream(cstr, ") Tj\n");
 	}
 	objstream(cstr, "ET");
 
@@ -285,18 +353,9 @@ void pdf_backend(paragraph *sourceform, keywordlist *keywords,
 		    objref(annot, (object *)xr->dest.page->spare);
 		    objtext(annot, " /XYZ null null null]\n");
 		} else {
-		    char *p;
-
-		    objtext(annot, "/A <<\n/Type /Action\n/S /URI\n/URI (");
-		    for (p = xr->dest.url; *p; p++) {
-			char c[2];
-			c[0] = *p;
-			c[1] = '\0';
-			if (*p == '(' || *p == ')' || *p == '\\')
-			    objtext(annot, "\\");
-			objtext(annot, c);
-		    }
-		    objtext(annot, ")\n>>\n");
+		    objtext(annot, "/A <<\n/Type /Action\n/S /URI\n/URI ");
+		    pdf_string(objtext, annot, xr->dest.url);
+		    objtext(annot, "\n>>\n");
 		}
 
 		objtext(annot, ">>\n");
@@ -596,7 +655,7 @@ static int make_outline(object *parent, outline_element *items, int n,
     level = items->level;
 
     while (n > 0) {
-	char *title, *p;
+	char *title;
 
 	/*
 	 * Here we expect to be sitting on an item at the given
@@ -611,16 +670,9 @@ static int make_outline(object *parent, outline_element *items, int n,
 	curr = new_object(parent->list);
 	if (!first) first = curr;
 	last = curr;
-	objtext(curr, "<<\n/Title (");
-	for (p = title; *p; p++) {
-	    char c[2];
-	    if (*p == '\\' || *p == '(' || *p == ')')
-		objtext(curr, "\\");
-	    c[0] = *p;
-	    c[1] = '\0';
-	    objtext(curr, c);
-	}
-	objtext(curr, ")\n/Parent ");
+	objtext(curr, "<<\n/Title ");
+	pdf_string(objtext, curr, title);
+	objtext(curr, "\n/Parent ");
 	objref(curr, parent);
 	objtext(curr, "\n/Dest [");
 	objref(curr, (object *)items->pdata->first->page->spare);
@@ -709,3 +761,20 @@ static int pdf_versionid(FILE *fp, word *words)
 
     return ret;
 }
+
+static void pdf_string(void (*add)(object *, char const *),
+		       object *o, char const *str)
+{ /* Emit str through add() as a "(...)" literal, escaping '\\', '(' and ')'. */
+    char const *p;
+
+    add(o, "("); /* opening delimiter of the literal */
+    for (p = str; *p; p++) {
+	char c[2]; /* one-character NUL-terminated string, since add() takes a string */
+	if (*p == '\\' || *p == '(' || *p == ')')
+	    add(o, "\\"); /* precede backslash and both parentheses with a backslash */
+	c[0] = *p;
+	c[1] = '\0';
+	add(o, c);
+    }
+    add(o, ")"); /* closing delimiter of the literal */
+}
diff --git a/bk_ps.c b/bk_ps.c
index c9f4223..7a59ac0 100644
--- a/bk_ps.c
+++ b/bk_ps.c
@@ -73,6 +73,27 @@ void ps_backend(paragraph *sourceform, keywordlist *keywords,
     fprintf(fp, "%%%%EndComments\n");
 
     fprintf(fp, "%%%%BeginProlog\n");
+    /*
+     * Supply a prologue function which allows a reasonably
+     * compressed representation of the text on the pages.
+     * 
+     * Expects two arguments: a y-coordinate, and then an array.
+     * Elements of the array are processed sequentially as follows:
+     * 
+     *  - a number is treated as an x-coordinate
+     *  - an array is treated as a (font, size) pair
+     *  - a string is shown
+     */
+    fprintf(fp,
+	    "/t {\n"
+	    "  exch /y exch def {\n"
+	    "    /x exch def\n"
+	    "    x type [] type eq {x aload pop scalefont setfont} if\n"
+	    "    x type dup 1 type eq exch 1.0 type eq or {x y moveto} if\n"
+	    "    x type () type eq {x show} if\n"
+	    "  } forall\n"
+	    "} def\n");
+
     fprintf(fp, "%%%%EndProlog\n");
 
     fprintf(fp, "%%%%BeginSetup\n");
@@ -119,7 +140,7 @@ void ps_backend(paragraph *sourceform, keywordlist *keywords,
      */
     pageno = 0;
     for (page = doc->pages; page; page = page->next) {
-	text_fragment *frag;
+	text_fragment *frag, *frag_end;
 	rect *r;
 
 	pageno++;
@@ -154,20 +175,44 @@ void ps_backend(paragraph *sourceform, keywordlist *keywords,
 		    r->h / 4096.0, r->w / 4096.0);
 	}
 
-	for (frag = page->first_text; frag; frag = frag->next) {
+	frag = page->first_text;
+	while (frag) {
+	    font_encoding *fe;
+	    int fs;
 	    char *c;
 
-	    fprintf(fp, "%s %d scalefont setfont %g %g moveto (",
-		   frag->fe->name, frag->fontsize,
-		   frag->x/4096.0, frag->y/4096.0);
+	    /*
+	     * Collect all the adjacent text fragments with the
+	     * same y-coordinate.
+	     */
+	    for (frag_end = frag;
+		 frag_end && frag_end->y == frag->y;
+		 frag_end = frag_end->next);
+
+	    fprintf(fp, "%g[", frag->y / 4096.0);
+
+	    fe = NULL;
+	    fs = -1;
+
+	    while (frag && frag != frag_end) {
+
+		if (frag->fe != fe || frag->fontsize != fs)
+		    fprintf(fp, "[%s %d]", frag->fe->name, frag->fontsize);
+		fe = frag->fe;
+		fs = frag->fontsize;
+
+		fprintf(fp, "%g(", frag->x/4096.0);
+		for (c = frag->text; *c; c++) {
+		    if (*c == '(' || *c == ')' || *c == '\\')
+			fputc('\\', fp);
+		    fputc(*c, fp);
+		}
+		fprintf(fp, ")");
 
-	    for (c = frag->text; *c; c++) {
-		if (*c == '(' || *c == ')' || *c == '\\')
-		    fputc('\\', fp);
-		fputc(*c, fp);
+		frag = frag->next;
 	    }
 
-	    fprintf(fp, ") show\n");
+	    fprintf(fp, "]t\n");
 	}
 
 	fprintf(fp, "showpage\n");
diff --git a/paper.h b/paper.h
index 8bb3061..d6f34b9 100644
--- a/paper.h
+++ b/paper.h
@@ -276,6 +276,7 @@ struct text_fragment_Tag {
     font_encoding *fe;
     int fontsize;
     char *text;
+    int width;
 };
 
 struct xref_dest_Tag {