--- /dev/null
+/*$Id: unfoldHDR.c,v 1.14 1999/11/06 05:25:14 lindberg Exp $*/
+/*$Name: ezmlm-idx-040 $*/
+
+#include "stralloc.h"
+#include "strerr.h"
+#include "case.h"
+#include "byte.h"
+#include "errtxt.h"
+#include "mime.h"
+
+static stralloc tmpdata = {0};	/* scratch buffer that unfoldHDR() fills and reuses on every call */
+
+static int trimre(cpp,cpend,prefix,fatal)
+char **cpp;
+char *cpend;
+stralloc *prefix;
+char *fatal;
+  /* Strips leading junk from the text that starts at *cpp and ends  */
+  /* at cpend (inclusive). A pass skips leading blanks/tabs, then    */
+  /* removes one reply indicator and/or one copy of the list prefix; */
+  /* passes repeat until a pass removes nothing. On return *cpp      */
+  /* points to the first byte that was kept (at most cpend + 1).     */
+  /* prefix may contain one '#' standing for a message number; text  */
+  /* before and after the '#' is compared ignoring blanks on both    */
+  /* sides. No prefix removal is attempted when prefix is empty or   */
+  /* when byte_rchr() yields 0. fatal is not used here.              */
+  /* Returns a bit mask: 1 = reply indicator removed,                */
+  /*                     2 = prefix removed.                         */
+{
+  int r = 0;
+  register char *cp;
+  char *cpnew;
+  int junk;
+  unsigned int i,j;
+  unsigned int serial;
+
+  cp = *cpp;
+  serial = prefix->len;			/* offset of serial number marker */
+  if (serial)
+    serial = byte_rchr(prefix->s,prefix->len,'#');
+
+  junk = 1;
+  while (junk) {
+    junk = 0;
+    while (cp <= cpend && (*cp == ' ' || *cp == '\t')) cp++;
+    cpnew = cp;
+    while (++cpnew <= cpend) {		/* /(..+:\s)/ is a reply indicator */
+      if (*cpnew == ' ') {		/* only the first blank is examined */
+        if (cpnew < cp + 3) break;	/* at least 3 char before ' ' */
+        if (*(cpnew - 1) != ':') break;	/* require ':' before ' ' */
+        if (cpnew > cp + 5) {		/* if > 4 char before ':' require */
+          register char ch;
+          ch = *(cpnew - 2);		/* XX^3, XX[3], XX(3) */
+          if (ch != ')' && ch != ']' && (ch < '0' || ch > '9'))
+            break;
+        }
+        junk = 1;
+        r |= 1;
+        cp = cpnew + 1;
+        break;
+      }
+    }
+    /* prefix removal is complicated by the inconsistent handling of ' ' */
+    /* when there are rfc2047-encoded words in the subject. We first     */
+    /* compare prefix before "serial" ignoring space, then skip the      */
+    /* number, then compare after "serial". If both matched we've found  */
+    /* the prefix.                                                       */
+    if (serial) {
+      cpnew = cp;
+      i = 0;
+      while (i < serial && cpnew <= cpend) {
+        if (*cpnew != ' ') {
+          if (prefix->s[i] == ' ') {	/* blank in prefix: skip it only */
+            ++i;
+            continue;
+          }
+          if (*cpnew != prefix->s[i]) break;
+          ++i;
+        }
+        ++cpnew;
+      }
+      if (i == serial) {		/* match before serial */
+        j = prefix->len;
+        if (serial != j) {		/* got a '#' */
+          /* skip number/space. The whole character test must be guarded */
+          /* by the bounds check: without the parentheses '&&' binds     */
+          /* tighter than '||' and the digit test would run past cpend.  */
+          while (cpnew <= cpend &&
+                 (*cpnew == ' ' || (*cpnew >= '0' && *cpnew <= '9')))
+            ++cpnew;
+          i = serial + 1;
+          while (i < j && cpnew <= cpend) {
+            if (*cpnew != ' ') {
+              if (prefix->s[i] == ' ') {
+                ++i;
+                continue;
+              }
+              if (*cpnew != prefix->s[i]) break;
+              ++i;
+            }
+            ++cpnew;
+          }
+        }
+        if (i == j) {			/* whole prefix matched */
+          cp = cpnew;
+          junk = 1;
+          r |= 2;
+        }
+      }
+    }
+  }
+  *cpp = cp;
+  return r;
+}
+
+static int trimend(indata,np,fatal)
+char *indata;
+unsigned int *np;
+char *fatal;
+  /* Works backwards from the last of the *np bytes at indata,       */
+  /* dropping trailing ' ', '\t', '\r' and '\n' and any trailing     */
+  /* "-Reply" (as matched by case_startb), repeatedly. Stores the    */
+  /* remaining length in *np. indata itself is not modified and      */
+  /* fatal is not used. Returns 1 if a "-Reply" was dropped, else 0. */
+{
+  char *tail;
+  int found = 0;
+
+  if (!*np) return 0;			/* nothing to look at */
+  tail = indata + *np - 1;		/* last byte of the line */
+  for (;;) {
+    while (tail >= indata) {		/* back up over trailing white space */
+      if (*tail != ' ' && *tail != '\t' && *tail != '\r' && *tail != '\n')
+        break;
+      --tail;
+    }
+    if (tail - indata < 5) break;	/* fewer than 6 bytes left */
+    if (!case_startb(tail - 5,6,"-Reply")) break;
+    tail -= 6;				/* drop the marker and look again */
+    found = 1;
+  }
+  *np = (unsigned int) (tail - indata + 1);	/* new length */
+  return found;
+}
+
+int unfoldHDR(indata,n,outdata,charset,prefix,flagtrimsub,fatal)
+char *indata;
+unsigned int n;
+stralloc *outdata;
+char *charset;
+stralloc *prefix;
+int flagtrimsub;
+char *fatal;
+  /* Takes a header as indata (n bytes). Removal of line breaks and  */
+  /* Q and B decoding should already have been done. Redundant       */
+  /* iso-2022 shift/escape codes are removed, a trailing "-Reply"    */
+  /* and trailing white space are trimmed and, if flagtrimsub is     */
+  /* set, leading reply indicators and the prefix are removed. The   */
+  /* result is stored in outdata as a single line without trailing   */
+  /* \n or \0; any '\0' or '\n' inside it is replaced by ' '.        */
+  /* For iso-2022-jp* the bytes of indata may be rewritten in place. */
+  /* fatal is passed to die_nomem() when an allocation fails.        */
+  /* returns 0 = no reply no prefix                                  */
+  /*         1 = reply no prefix                                     */
+  /*         2 = no reply, prefix                                    */
+  /*         3 = reply & prefix                                      */
+{
+  int r = 0;
+  char *cp,*cpesc,*cpnext,*cpend,*cpout;
+  char state,cset,newcset;
+  int reg,newreg;
+
+  cp = indata;
+  cpend = cp + n - 1;			/* last byte of input */
+  cpnext = cp;				/* start of text not yet copied */
+  if (!stralloc_copys(&tmpdata,"")) die_nomem(fatal);
+  if (!stralloc_ready(&tmpdata,n)) die_nomem(fatal);
+
+  if (!case_diffb(charset,11,"iso-2022-jp")) {
+    /* iso-2022-jp-2 (rfc1554) and its subset iso-2022-jp. The reg #s */
+    /* are from the rfc. Don't ask why they have multiple length G0   */
+    /* charset designations ... JIS X 0201-roman is identical to      */
+    /* iso646 us-ascii except for currency and tilde. Making them the */
+    /* same increases hits without significant loss. JIS X 0208-1978  */
+    /* is superseded by JIS X 0208-1983 and converted here as well.   */
+
+    /* pass 1: ESC ( J -> ESC ( B  and  ESC $ @ -> ESC $ B, in place  */
+    while (cp < cpend) {
+      if (*cp++ != ESC) continue;
+      if (*cp == '(') {
+        if (++cp > cpend) break;
+        if (*cp == 'J') *cp = 'B';
+        ++cp;
+      } else if (*cp == '$') {
+        if (++cp > cpend) break;
+        if (*cp == '@') *cp = 'B';
+        ++cp;
+      }
+    }
+    /* pass 2: eliminate redundant ESC seqs */
+    cp = indata;
+    cpnext = cp;
+    reg = 6;				/* start out in us-ascii */
+    while (cp < cpend) {
+      if (*cp++ != ESC) continue;
+      cpesc = cp - 1;			/* start of this designation */
+      if (*cp == '$') {
+        if (++cp > cpend) break;
+        if (*cp == 'B') newreg = 87;
+        else if (*cp == 'A') newreg = 58;
+        else if (*cp == '(') {
+          if (++cp > cpend) break;
+          if (*cp == 'C') newreg = 149;
+          else if (*cp == 'D') newreg = 159;
+          else continue;
+        } else continue;
+      } else if (*cp == '(') {
+        if (++cp > cpend) break;
+        if (*cp == 'B') newreg = 6;
+        else continue;
+      } else continue;
+      if (++cp > cpend) break;
+      while (*cp == ' ' || *cp == '\t')
+        if (++cp >= cpend) break;	/* skip space */
+      /* maybe another G0 designation follows; then this one has no   */
+      /* effect. Only peek at cp+1 when it is still inside the input. */
+      if (cp < cpend && *cp == ESC &&
+          (*(cp+1) == '(' || *(cp+1) == '$')) {
+        if (!stralloc_catb(&tmpdata,cpnext,cpesc-cpnext)) die_nomem(fatal);
+        cpnext = cp;
+        continue;
+      }
+      if (reg == newreg) {		/* already active: drop it */
+        if (!stralloc_catb(&tmpdata,cpnext,cpesc-cpnext)) die_nomem(fatal);
+        cpnext = cp;
+      } else {
+        reg = newreg;
+      }
+    }
+					/* copy remainder of line */
+    if (!stralloc_catb(&tmpdata,cpnext,cpend - cpnext + 1)) die_nomem(fatal);
+    if (reg != 6) {	/* need to return to us-ascii at the end of the line */
+      if (!stralloc_cats(&tmpdata,TOASCII)) die_nomem(fatal);
+    } else {		/* maybe "-Reply" at the end? */
+      r = trimend(tmpdata.s,&(tmpdata.len),fatal);
+    }
+
+  } else if (!case_diffb(charset,11,"iso-2022-cn") ||
+             !case_diffb(charset,11,"iso-2022-kr")) {
+    /* these use SI/SO and ESC $ ) x as the SO designation. In -cn and */
+    /* -cn-ext, 'x' can be a number of different letters. In -kr it's  */
+    /* always 'C'. This routine may work also for other iso-2022 sets  */
+    /* also handles iso-2022-cn-ext                                    */
+    cpesc = (char *) 0;			/* points to latest ESC */
+    state = SI;				/* us-ascii */
+    cset = 0;				/* no SO-designation seen yet */
+
+    for (cp = indata; cp <= cpend; ++cp) {
+      if (*cp == SI || *cp == SO) {
+        if (state == *cp) {	/* already in state. Skip shift seq */
+          /* keep everything before the shift byte and resume after  */
+          /* it; cp points AT the shift byte here, not past it       */
+          if (!stralloc_catb(&tmpdata,cpnext,cp-cpnext)) die_nomem(fatal);
+          cpnext = cp + 1;
+        } else			/* set new state */
+          state = *cp;
+        if (++cp > cpend) break;
+        continue;
+      }
+      if (*cp != ESC) continue;
+      if (cp + 3 > cpend) break;	/* not space for full SO-designation */
+      cpesc = cp;
+      /* NOTE(review): *cp is ESC at this point, so (assuming ESC is   */
+      /* not '$') this test always continues and the SO-designation    */
+      /* handling below never runs. It looks as if cp was meant to be  */
+      /* advanced first; left unchanged because enabling it alters the */
+      /* output for existing data - confirm before changing.           */
+      if (*cp != '$') continue;
+      if (++cp > cpend) break;
+      if (*cp != ')') continue;
+      if (++cp > cpend) break;
+      newcset = *cp;
+      if (++cp > cpend) break;
+      while (cp <= cpend && (*cp == ' ' || *cp == '\t')) ++cp;
+      if (cp + 3 > cpend) break;	/* no space for full SO-designation */
+      if ((*cp == ESC && *(cp+1) == '$' && *(cp+2) == ')')
+          || (newcset == cset)) {
+        /* skip if a second SO-designation right after or     */
+        /* this SO-designation is already active, skip        */
+        if (!stralloc_catb(&tmpdata,cpnext,cpesc-cpnext)) die_nomem(fatal);
+        --cp;		/* "unpeek" so that next iteration will see char */
+        cpnext = cpesc + 4;
+        continue;
+      } else {
+        cset = newcset;
+        continue;
+      }
+    }
+					/* get remainder of line */
+    if (!stralloc_catb(&tmpdata,cpnext,cpend - cpnext + 1)) die_nomem(fatal);
+    /* braces are required: without them the else pairs with the      */
+    /* stralloc_cats() test and an ascii-terminated line is never     */
+    /* checked for "-Reply"                                           */
+    if (state != SI) {			/* need to end in ascii */
+      if (!stralloc_cats(&tmpdata,TOSI)) die_nomem(fatal);
+    } else {				/* ascii end; maybe "-Reply" at the end? */
+      r = trimend(tmpdata.s,&(tmpdata.len),fatal);
+    }
+
+  } else {		/* other character sets = no special treatment */
+    r = trimend(cp,&n,fatal);		/* -reply */
+    if (!stralloc_copyb(&tmpdata,cp,n)) die_nomem(fatal);
+  }
+
+  cp = tmpdata.s;
+  n = tmpdata.len;
+  cpend = cp + n - 1;
+  if (flagtrimsub) {	/* remove leading reply indicators & prefix */
+    r |= trimre(&cp,cpend,prefix,fatal);
+    n = (unsigned int) (cpend-cp+1);
+  }
+			/* there shouldn't be '\0' or '\n', but make sure as */
+			/* it would break the message index */
+  if (!stralloc_copys(outdata,"")) die_nomem(fatal);
+  if (!stralloc_ready(outdata,n)) die_nomem(fatal);
+  outdata->len = n;
+  cpout = outdata->s;
+  while (n--) {		/* '\n' and '\0' would break the subject index */
+    if (!*cp || *cp == '\n') *cpout = ' ';
+    else *cpout = *cp;
+    ++cp; ++cpout;
+  }
+  return r;
+}
+