X-Git-Url: https://git.distorted.org.uk/~mdw/ezmlm/blobdiff_plain/5b62e993b0af39700031c2875d7f6654e6a02850..f8beb284087c279acfb30506f5bb32baa4949b44:/unfoldHDR.c

diff --git a/unfoldHDR.c b/unfoldHDR.c
new file mode 100644
index 0000000..6004f0b
--- /dev/null
+++ b/unfoldHDR.c
@@ -0,0 +1,303 @@
+/*$Id: unfoldHDR.c,v 1.14 1999/11/06 05:25:14 lindberg Exp $*/
+/*$Name: ezmlm-idx-040 $*/
+
+#include "stralloc.h"
+#include "strerr.h"
+#include "case.h"
+#include "byte.h"
+#include "errtxt.h"
+#include "mime.h"
+
+static stralloc tmpdata = {0};	/* scratch buffer; unfoldHDR() builds its result here */
+
+static int trimre(cpp,cpend,prefix,fatal)
+char **cpp;		/* in: start of text; out: first byte that is kept */
+char *cpend;		/* last byte of the text (inclusive) */
+stralloc *prefix;	/* list prefix; a '#' stands for the message number */
+char *fatal;		/* not used in this function */
+	/* Repeatedly skips leading LWSP, reply indicators and the prefix. */
+	/* Returns 1 if a reply indicator was skipped, 2 if prefix, 3 if both. */
+{
+  int r = 0;
+  register char *cp;
+  char *cpnew;
+  int junk;			/* set whenever a pass removed something */
+  unsigned int i,j,serial;
+
+  cp = *cpp;
+  serial = prefix->len;		/* offset of '#' in prefix; len if none */
+  if (serial)
+    serial = byte_rchr(prefix->s,prefix->len,'#');
+
+  junk = 1;
+  while (junk) {
+    junk = 0;
+    while (cp <= cpend && (*cp == ' ' || *cp == '\t')) cp++;
+    cpnew = cp;
+    while (++cpnew <= cpend) {	/* /(..+:\s)/ is a reply indicator */
+      if (*cpnew == ' ') {
+        if (cpnew < cp + 3) break;	/* at least 3 char before ' ' */
+	if (*(cpnew - 1) != ':') break;	/* require ':' before ' ' */
+	if (cpnew > cp + 5) {		/* if > 4 char before ':' require */
+	  register char ch;
+	  ch = *(cpnew - 2);		/* XX^3, XX[3], XX(3) */
+	  if (ch != ')' && ch != ']' && (ch < '0' || ch > '9'))
+	    break;
+	}
+	junk = 1;
+	r |= 1;
+	cp = cpnew + 1;
+        break;
+      }
+    }
+	/* prefix removal is complicated by the inconsistent handling of ' ' */
+	/* when there are rfc2047-encoded words in the subject. We first     */
+	/* compare prefix before "serial" ignoring space, then skip the      */
+	/* number, then compare after "serial". If both matched we've found  */
+	/* the prefix. */
+    if (serial) {
+      cpnew = cp;
+      i = 0;
+      while (i < serial && cpnew <= cpend) {
+        if (*cpnew != ' ') {
+          if (prefix->s[i] == ' ') {
+            ++i;
+            continue;
+          }
+          if (*cpnew != prefix->s[i]) break;
+          ++i;
+        }
+        ++cpnew;
+      }
+      if (i == serial) {		/* match before serial */
+        j = prefix->len;
+        if (serial != j) {		/* got a '#' */
+          while (cpnew <= cpend &&	/* skip number/space, staying in bounds */
+		(*cpnew == ' ' || (*cpnew <= '9' && *cpnew >= '0'))) ++cpnew;
+          i = serial + 1;
+          while (i < j && cpnew <= cpend) {
+            if (*cpnew != ' ') {
+              if (prefix->s[i] == ' ') {
+                ++i;
+                continue;
+              }
+              if (*cpnew != prefix->s[i]) break;
+              ++i;
+            }
+            ++cpnew;
+          }
+        }
+        if (i == j) {
+          cp = cpnew;
+          junk = 1;
+          r |= 2;
+        }
+      }
+    }
+  }
+  *cpp = cp;
+  return r;
+}
+
+static int trimend(indata,np,fatal)
+char *indata;		/* buffer to examine; not modified */
+unsigned int *np;	/* in: length of indata, out: trimmed length */
+char *fatal;		/* not used in this function */
+	/* Strips trailing blanks, tabs, CR and LF from the first *np    */
+	/* bytes of indata, together with any number of trailing         */
+	/* "-Reply" markers (matched via case_startb). Stores the length */
+	/* that remains in *np. Returns 1 if a marker was cut, else 0.   */
+{
+  unsigned int len = *np;	/* bytes still considered part of the line */
+  int found = 0;
+  int again;
+
+  do {
+    again = 0;
+    while (len > 0) {		/* drop trailing white space / line ends */
+      char ch = indata[len - 1];
+      if (ch != ' ' && ch != '\t' && ch != '\r' && ch != '\n') break;
+      --len;
+    }
+    if (len >= 6 && case_startb(indata + len - 6,6,"-Reply")) {
+      len -= 6;			/* cut the marker and rescan the tail */
+      found = 1;
+      again = 1;
+    }
+  } while (again);
+
+  *np = len;
+  return found;
+}
+
+int unfoldHDR(indata,n,outdata,charset,prefix,flagtrimsub,fatal)
+char *indata;		/* header text; may be modified in place */
+unsigned int n;		/* number of bytes in indata */
+stralloc *outdata;	/* receives the normalized single line */
+char *charset;		/* charset name selecting the treatment below */
+stralloc *prefix;	/* list prefix handed to trimre() */
+int flagtrimsub;	/* non-zero: strip reply indicators & prefix */
+char *fatal;		/* passed to die_nomem() on allocation failure */
+	/* Normalizes the n bytes of indata (line breaks removed and Q/B */
+	/* decoding already done by the caller) into outdata: redundant  */
+	/* iso-2022 shift/escape codes are dropped, trailing LWSP and    */
+	/* "-Reply" are trimmed when the line ends in us-ascii, and, if  */
+	/* flagtrimsub, leading reply indicators and prefix are removed. */
+	/* outdata is a single line without '\n' or '\0'.                */
+	/* returns 0 = no reply, no prefix    1 = reply, no prefix       */
+	/*         2 = no reply, prefix       3 = reply & prefix         */
+	/* indata itself is rewritten for iso-2022-jp (JIS -> ascii).    */
+{
+  int r = 0;
+  char *cp,*cpesc,*cpnext,*cpend,*cpout;
+  char state,cset,newcset;
+  int reg,newreg;
+
+  cp = indata;		/* scan position */
+  cpend = cp + n - 1;	/* last input byte (inclusive) */
+  cpnext = cp;		/* first byte not yet copied to tmpdata */
+  if (!stralloc_copys(&tmpdata,"")) die_nomem(fatal);
+  if (!stralloc_ready(&tmpdata,n)) die_nomem(fatal);
+
+  if(!case_diffb(charset,11,"iso-2022-jp")) {
+	/* iso-2022-jp-2 (rfc1554) and its subset iso-2022-jp. The reg #s */
+	/* are from the rfc. Don't ask why they have multiple length G0   */
+	/* charset designations ... JIS X 0201-roman is identical to      */
+	/* iso646 us-ascii except for currency and tilde. Making them the */
+	/* same increases hits without significant loss. JIS X 0208-1978  */
+	/* is superseded by JIS X 0208-1983 and converted here as well.   */
+
+    while (cp < cpend) {
+      if (*cp++ != ESC) continue;
+      if (*cp == '(') {
+        if (++cp > cpend) break;
+        if (*cp == 'J') *cp = 'B';	/* JIS X 0201-roman -> us-ascii */
+        ++cp;
+      } else if (*cp == '$') {
+        if (++cp > cpend) break;
+        if (*cp == '@') *cp = 'B';	/* JIS X 0208-1978 -> -1983 */
+        ++cp;
+      }
+    }
+		/* eliminate redundant ESC seqs */
+    cp = indata;
+    cpnext = cp;
+    reg = 6;			/* charset currently designated; 6 = ascii */
+    while (cp < cpend) {
+      if (*cp++ != ESC) continue;
+      cpesc = cp - 1;		/* start of this escape sequence */
+      if (*cp == '$') {
+        if (++cp > cpend) break;
+        if (*cp == 'B') newreg = 87;
+        else if (*cp == 'A') newreg = 58;
+        else if (*cp == '(') {
+          if (++cp > cpend) break;
+          if (*cp == 'C') newreg = 149;
+          else if (*cp == 'D') newreg = 159;
+          else continue;
+        } else continue;
+      } else if (*cp == '(') {
+        if (++cp > cpend) break;
+        if (*cp == 'B') newreg = 6;
+        else continue;
+      } else continue;
+      if (++cp > cpend) break;
+      while (*cp == ' ' || *cp == '\t')
+        if (++cp >= cpend) break;	/* skip space */
+      if (cp < cpend && *cp == ESC)	/* in-bounds peek: another G0 designation? */
+        if (*(cp+1) == '(' || *(cp+1) == '$') {	 /* yep! */
+          if (!stralloc_catb(&tmpdata,cpnext,cpesc-cpnext)) die_nomem(fatal);
+          cpnext = cp;
+	  continue;
+      }
+      if (reg == newreg) {	/* designation already active: drop it */
+        if (!stralloc_catb(&tmpdata,cpnext,cpesc-cpnext)) die_nomem(fatal);
+        cpnext = cp;
+      } else {
+        reg = newreg;
+      }		/* copy remainder of line */
+    }
+    if (!stralloc_catb(&tmpdata,cpnext,cpend - cpnext + 1)) die_nomem(fatal);
+    if (reg != 6) {	/* need to return to us-ascii at the end of the line */
+      if (!stralloc_cats(&tmpdata,TOASCII)) die_nomem(fatal);
+    } else {		/* maybe "-Reply at the end?" */
+      r = trimend(tmpdata.s,&(tmpdata.len),fatal);
+    }
+
+  } else if (!case_diffb(charset,11,"iso-2022-cn") ||
+             !case_diffb(charset,11,"iso-2022-kr")) {
+	/* these use SI/SO and ESC $ ) x as the SO designation. In -cn and */
+	/* -cn-ext, 'x' can be a number of different letters. In -kr it's  */
+	/* always 'C'. This routine may work also for other iso-2022 sets  */
+	/* also handles iso-2022-cn-ext */
+    cpesc = (char *) 0;	/* points to latest ESC */
+    state = SI; cset = 0;	/* start in us-ascii, no SO charset seen yet */
+    --cp;		/* set up for loop */
+
+    while (++cp <= cpend) {
+      if (*cp == SI || *cp == SO) {
+        if (state == *cp) {		 /* already in state. Skip shift seq */
+          if (!stralloc_catb(&tmpdata,cpnext,cp-cpnext)) die_nomem(fatal);
+          cpnext = cp + 1;		/* resume copying after the shift byte */
+        } else				/* set new state */
+          state = *cp;
+        if (++cp > cpend) break;
+        continue;
+      }
+      if (*cp != ESC) continue;
+      if (cp + 3 > cpend) break;	/* not space for full SO-designation */
+      cpesc = cp;
+      if (*cp != '$') continue;	/* NOTE(review): cp still points at ESC, */
+      if (++cp > cpend) break;	/* so this test always continues and the */
+      if (*cp != ')') continue;	/* designation handling below looks      */
+      if (++cp > cpend) break;	/* unreachable -- confirm intent.        */
+      newcset = *cp;
+      if (++cp > cpend) break;
+      while (cp <= cpend && (*cp == ' ' || *cp == '\t')) ++cp;
+      if (cp + 3 > cpend) break;	/* no space for full SO-designation */
+      if ((*cp == ESC && *(cp+1) == '$' && *(cp+2) == ')')
+		|| (newcset == cset)) {
+			/* skip if a second SO-designation right after or */
+			/* this SO-designation is already active, skip */
+        if (!stralloc_catb(&tmpdata,cpnext,cpesc-cpnext)) die_nomem(fatal);
+        --cp;		/* "unpeek" so that next iteration will see char */
+        cpnext = cpesc + 4;
+        continue;
+      } else {
+        cset = newcset;
+        continue;
+      }
+    }
+			/* get remainder of line */
+    if (!stralloc_catb(&tmpdata,cpnext,cpend - cpnext + 1)) die_nomem(fatal);
+    if (state != SI) {	/* need to end in ascii */
+      if (!stralloc_cats(&tmpdata,TOSI)) die_nomem(fatal);
+    } else		/* ascii end; maybe "-Reply" at the end? */
+      r = trimend(tmpdata.s,&(tmpdata.len),fatal);
+
+  } else {		/* other character sets = no special treatment */
+    r = trimend(cp,&n,fatal);		/* -reply */
+    if (!stralloc_copyb(&tmpdata,cp,n)) die_nomem(fatal);
+  }
+
+  cp = tmpdata.s;
+  n = tmpdata.len;
+  cpend = cp + n - 1;
+  if (flagtrimsub) {	 /* remove leading reply indicators & prefix*/
+    r |= trimre(&cp,cpend,prefix,fatal);
+    n = (cp > cpend) ? 0 : (unsigned int) (cpend-cp+1);	/* never negative */
+  }
+			/* there shouldn't be '\0' or '\n', but make sure as */
+			/* it would break the message index */
+  if (!stralloc_copys(outdata,"")) die_nomem(fatal);
+  if (!stralloc_ready(outdata,n)) die_nomem(fatal);
+  outdata->len = n;
+  cpout = outdata->s;
+  while (n--) {		/* '\n' and '\0' would break the subject index */
+    if (!*cp || *cp == '\n') *cpout = ' ';
+    else *cpout = *cp;
+    ++cp; ++cpout;
+  }
+  return r;
+}
+