X-Git-Url: https://git.distorted.org.uk/~mdw/ezmlm/blobdiff_plain/5b62e993b0af39700031c2875d7f6654e6a02850..f8beb284087c279acfb30506f5bb32baa4949b44:/unfoldHDR.c diff --git a/unfoldHDR.c b/unfoldHDR.c new file mode 100644 index 0000000..6004f0b --- /dev/null +++ b/unfoldHDR.c @@ -0,0 +1,303 @@ +/*$Id: unfoldHDR.c,v 1.14 1999/11/06 05:25:14 lindberg Exp $*/ +/*$Name: ezmlm-idx-040 $*/ + +#include "stralloc.h" +#include "strerr.h" +#include "case.h" +#include "byte.h" +#include "errtxt.h" +#include "mime.h" + +static stralloc tmpdata = {0}; + +static int trimre(cpp,cpend,prefix,fatal) +char **cpp; +char *cpend; +stralloc *prefix; +char *fatal; + +{ + int r = 0; + register char *cp; + char *cpnew; + int junk; + unsigned int i,j; + unsigned int serial; + + cp = *cpp; + serial = prefix->len; /* pointer to serial number */ + if (serial) + serial = byte_rchr(prefix->s,prefix->len,'#'); + + junk = 1; + while (junk) { + junk = 0; + while (cp <= cpend && (*cp == ' ' || *cp == '\t')) cp++; + cpnew = cp; + while (++cpnew <= cpend) { /* /(..+:\s)/ is a reply indicator */ + if (*cpnew == ' ') { + if (cpnew < cp + 3) break; /* at least 3 char before ' ' */ + if (*(cpnew - 1) != ':') break; /* require ':' before ' ' */ + if (cpnew > cp + 5) { /* if > 4 char before ':' require */ + register char ch; + ch = *(cpnew - 2); /* XX^3, XX[3], XX(3) */ + if (ch != ')' && ch != ']' && (ch < '0' || ch > '9')) + break; + } + junk = 1; + r |= 1; + cp = cpnew + 1; + break; + } + } + /* prefix removal is complicated by the inconsistent handling of ' ' */ + /* when there are rfc2047-encoded words in the subject. We first */ + /* compare prefix before "serial" ignoring space, then skip the */ + /* number, then compare after "serial". If both matched we've found */ + /* the prefix. */ + if (serial) { + cpnew = cp; + i = 0; + while (i < serial && cpnew <= cpend) { + if (*cpnew != ' ') { + if (prefix->s[i] == ' ') { + ++i; + continue; + } + if (*cpnew != prefix->s[i]) break; + ++i; + } + ++cpnew; + } + if (i == serial) { /* match before serial */ + j = prefix->len; + if (serial != j) { /* got a '#' */ + while (cpnew <= cpend && /* skip number/space */ + *cpnew == ' ' || (*cpnew <= '9' && *cpnew >= '0')) ++cpnew; + i = serial + 1; + while (i < j && cpnew <= cpend) { + if (*cpnew != ' ') { + if (prefix->s[i] == ' ') { + ++i; + continue; + } + if (*cpnew != prefix->s[i]) break; + ++i; + } + ++cpnew; + } + } + if (i == j) { + cp = cpnew; + junk = 1; + r |= 2; + } + } + } + } + *cpp = cp; + return r; +} + +static int trimend(indata,np,fatal) +char *indata; +unsigned int *np; +char *fatal; + /* looks at indata of length n from the end removing LWSP & '\n' */ + /* and any trailing '-Reply'. Sets n to new length and returns: */ + /* 0 - not reply, 1 - reply. */ +{ + char *cplast; + int junk; + int r = 0; + + if (*np == 0) return 0; + cplast = indata + *np - 1; /* points to last char on line */ + junk = 1; + while (junk) { + junk = 0; + while (cplast >= indata && + (*cplast == ' ' || *cplast == '\t' || + *cplast == '\r' || *cplast == '\n')) + --cplast; + if (cplast - indata >= 5 && case_startb(cplast - 5,6,"-Reply")) { + cplast -= 6; + r = 1; + junk = 1; + } + } + *np = (unsigned int) (cplast - indata + 1); /* new length */ + return r; +} + +int unfoldHDR(indata,n,outdata,charset,prefix,flagtrimsub,fatal) +char *indata; +unsigned int n; +stralloc *outdata; +char *charset; +stralloc *prefix; +int flagtrimsub; +char *fatal; + /* takes a header as indata. Removal of reply-indicators is done */ + /* but removal of line breaks and Q and B decoding should have */ + /* been done. Returns a */ + /* single line header without trailing \n or \0. Mainly, we */ + /* remove redundant shift codes */ + /* returns 0 = no reply no prefix */ + /* 1 = reply no prefix */ + /* 2 = no reply, prefix */ + /* 3 = reply & pefix */ +{ + int r = 0; + char *cp,*cpesc,*cpnext,*cpend,*cpout; + char state,cset,newcset; + int reg,newreg; + + cp = indata; /* JIS X 0201 -> ISO646 us-ascii */ + cpend = cp + n - 1; + cpnext = cp; + if (!stralloc_copys(&tmpdata,"")) die_nomem(fatal); + if (!stralloc_ready(&tmpdata,n)) die_nomem(fatal); + + if(!case_diffb(charset,11,"iso-2022-jp")) { + /* iso-2022-jp-2 (rfc1554) and its subset iso-2022-jp. The reg #s */ + /* are from the rfc. Don't ask why they have multiple length G0 */ + /* charset designations ... JIS X 0201-roman is identical to */ + /* iso646 us-ascii except for currency and tilde. Making them the */ + /* same increases hits without significant loss. JIS X 0208-1978 */ + /* is superceded by JIS X 0208-1983 and converted here as well. */ + + while (cp < cpend) { + if (*cp++ != ESC) continue; + if (*cp == '(') { + if (++cp > cpend) break; + if (*cp == 'J') *cp = 'B'; + ++cp; + } else if (*cp == '$') { + if (++cp > cpend) break; + if (*cp == '@') *cp = 'B'; + ++cp; + } + } + /* eliminate redundant ESC seqs */ + cp = indata; + cpnext = cp; + reg = 6; + while (cp < cpend) { + if (*cp++ != ESC) continue; + cpesc = cp - 1; + if (*cp == '$') { + if (++cp > cpend) break; + if (*cp == 'B') newreg = 87; + else if (*cp == 'A') newreg = 58; + else if (*cp == '(') { + if (++cp > cpend) break; + if (*cp == 'C') newreg = 149; + else if (*cp == 'D') newreg = 159; + else continue; + } else continue; + } else if (*cp == '(') { + if (++cp > cpend) break; + if (*cp == 'B') newreg = 6; + else continue; + } else continue; + if (++cp > cpend) break; + while (*cp == ' ' || *cp == '\t') + if (++cp >= cpend) break; /* skip space */ + if (*cp == ESC) /* maybe another G0 designation */ + if (*(cp+1) == '(' || *(cp+1) == '$') { /* yep! */ + if (!stralloc_catb(&tmpdata,cpnext,cpesc-cpnext)) die_nomem(fatal); + cpnext = cp; + continue; + } + if (reg == newreg) { + if (!stralloc_catb(&tmpdata,cpnext,cpesc-cpnext)) die_nomem(fatal); + cpnext = cp; + } else { + reg = newreg; + } /* copy remainder of line */ + } + if (!stralloc_catb(&tmpdata,cpnext,cpend - cpnext + 1)) die_nomem(fatal); + if (reg != 6) { /* need to return to us-ascii at the end of the line */ + if (!stralloc_cats(&tmpdata,TOASCII)) die_nomem(fatal); + } else { /* maybe "-Reply at the end?" */ + r = trimend(tmpdata.s,&(tmpdata.len),fatal); + } + + } else if (!case_diffb(charset,11,"iso-2022-cn") || + !case_diffb(charset,11,"iso-2022-kr")) { + /* these use SI/SO and ESC $ ) x as the SO designation. In -cn and */ + /* -cn-ext, 'x' can be a number of different letters. In -kr it's */ + /* always 'C'. This routine may work also for other iso-2022 sets */ + /* also handles iso-2022-cn-ext */ + cpesc = (char *) 0; /* points to latest ESC */ + state = SI; /* us-ascii */ + --cp; /* set up for loop */ + + while (++cp <= cpend) { + if (*cp == SI || *cp == SO) { + if (state == *cp) { /* already in state. Skip shift seq */ + if (!stralloc_catb(&tmpdata,cpnext,cp-cpnext-1)) die_nomem(fatal); + cpnext = cp; + } else /* set new state */ + state = *cp; + if (++cp > cpend) break; + continue; + } + if (*cp != ESC) continue; + if (cp + 3 > cpend) break; /* not space for full SO-designation */ + cpesc = cp; + if (*cp != '$') continue; + if (++cp > cpend) break; + if (*cp != ')') continue; + if (++cp > cpend) break; + newcset = *cp; + if (++cp > cpend) break; + while (cp <= cpend && (*cp == ' ' || *cp == '\t')) ++cp; + if (cp + 3 > cpend) break; /* no space for full SO-designation */ + if ((*cp == ESC && *(cp+1) == '$' && *(cp+2) == ')') + || (newcset == cset)) { + /* skip if a second SO-designation right after or */ + /* this SO-designation is already active, skip */ + if (!stralloc_catb(&tmpdata,cpnext,cpesc-cpnext)) die_nomem(fatal); + --cp; /* "unpeek" so that next iteration will see char */ + cpnext = cpesc + 4; + continue; + } else { + cset = newcset; + continue; + } + } + /* get remainder of line */ + if (!stralloc_catb(&tmpdata,cpnext,cpend - cpnext + 1)) die_nomem(fatal); + if (state != SI) /* need to end in ascii */ + if (!stralloc_cats(&tmpdata,TOSI)) die_nomem(fatal); + else /* ascii end; maybe "-Reply" at the end? */ + r = trimend(tmpdata.s,&(tmpdata.len),fatal); + + } else { /* other character sets = no special treatment */ + r = trimend(cp,&n,fatal); /* -reply */ + if (!stralloc_copyb(&tmpdata,cp,n)) die_nomem(fatal); + } + + cp = tmpdata.s; + n = tmpdata.len; + cpend = cp + n - 1; + if (flagtrimsub) { /* remove leading reply indicators & prefix*/ + r |= trimre(&cp,cpend,prefix,fatal); + n = (unsigned int) (cpend-cp+1); + } + /* there shouldn't be '\0' or '\n', but make sure as */ + /* it would break the message index */ + if (!stralloc_copys(outdata,"")) die_nomem(fatal); + if (!stralloc_ready(outdata,n)) die_nomem(fatal); + outdata->len = n; + cpout = outdata->s; + while (n--) { /* '\n' and '\0' would break the subject index */ + if (!*cp || *cp == '\n') *cpout = ' '; + else *cpout = *cp; + ++cp; ++cpout; + } + return r; +} +