[ezmlm] / unfoldHDR.c

/*$Id: unfoldHDR.c,v 1.14 1999/11/06 05:25:14 lindberg Exp $*/
/*$Name: ezmlm-idx-040 $*/

#include "stralloc.h"
#include "strerr.h"
#include "case.h"
#include "byte.h"
#include "errtxt.h"
#include "mime.h"

/* Scratch buffer shared by all calls to unfoldHDR(): the charset-specific */
/* cleanup writes its result here before it is trimmed and copied to the   */
/* caller's outdata. Being static, its allocation is kept between calls.   */
static stralloc tmpdata = {0};

static int trimre(cpp,cpend,prefix,fatal)
char **cpp;
char *cpend;
stralloc *prefix;
char *fatal;
	/* Strips leading reply indicators and the list prefix from the text */
	/* that starts at *cpp and whose last byte is at cpend.              */
	/*   cpp    - in: start of text; out: first byte after what was      */
	/*            stripped (never beyond cpend + 1).                     */
	/*   cpend  - last valid byte of the text (start - 1 if empty).      */
	/*   prefix - list prefix; a '#' in it stands for a message number.  */
	/*            An empty prefix disables prefix removal.               */
	/*   fatal  - not used by this routine.                              */
	/* Returns a bit mask: 1 = reply indicator removed, 2 = prefix       */
	/* removed. Both are stripped repeatedly and in any order.           */
{
  int r = 0;
  register char *cp;
  char *cpnew;
  int junk;
  unsigned int i,j;
  unsigned int serial;

  cp = *cpp;
  serial = prefix->len;		/* offset of the serial number marker */
  if (serial)			/* the "got a '#'" test below relies on   */
				/* byte_rchr giving prefix->len if no '#' */
    serial = byte_rchr(prefix->s,prefix->len,'#');

  junk = 1;
  while (junk) {		/* rescan for as long as something is removed */
    junk = 0;
    while (cp <= cpend && (*cp == ' ' || *cp == '\t')) cp++;
    cpnew = cp;
    while (++cpnew <= cpend) {	/* /(..+:\s)/ is a reply indicator */
      if (*cpnew == ' ') {
        if (cpnew < cp + 3) break;	/* at least 3 char before ' ' */
        if (*(cpnew - 1) != ':') break;	/* require ':' before ' ' */
        if (cpnew > cp + 5) {		/* if > 4 char before ':' require */
          register char ch;
          ch = *(cpnew - 2);		/* XX^3, XX[3], XX(3) */
          if (ch != ')' && ch != ']' && (ch < '0' || ch > '9'))
            break;
        }
        junk = 1;
        r |= 1;
        cp = cpnew + 1;
        break;
      }
    }
	/* prefix removal is complicated by the inconsistent handling of ' ' */
	/* when there are rfc2047-encoded words in the subject. We first     */
	/* compare prefix before "serial" ignoring space, then skip the      */
	/* number, then compare after "serial". If both matched we've found  */
	/* the prefix. */
    if (serial) {
      cpnew = cp;
      i = 0;
      while (i < serial && cpnew <= cpend) {
        if (*cpnew != ' ') {
          if (prefix->s[i] == ' ') {	/* blank in prefix matches nothing */
            ++i;
            continue;
          }
          if (*cpnew != prefix->s[i]) break;
          ++i;
        }
        ++cpnew;			/* blanks in the text are skipped */
      }
      if (i == serial) {		/* match before serial */
        j = prefix->len;
        if (serial != j) {		/* got a '#' */
		/* skip number/space. The bounds test has to guard both  */
		/* alternatives: without the parentheses "a && b || c"   */
		/* let the digit test run past cpend. */
          while (cpnew <= cpend &&
                 (*cpnew == ' ' || (*cpnew >= '0' && *cpnew <= '9')))
            ++cpnew;
          i = serial + 1;
          while (i < j && cpnew <= cpend) {
            if (*cpnew != ' ') {
              if (prefix->s[i] == ' ') {
                ++i;
                continue;
              }
              if (*cpnew != prefix->s[i]) break;
              ++i;
            }
            ++cpnew;
          }
        }
        if (i == j) {
		/* rescan only if text was consumed; a prefix made up of */
		/* blanks and/or '#' can match without advancing and     */
		/* would otherwise keep this loop running forever. */
          if (cpnew != cp) junk = 1;
          cp = cpnew;
          r |= 2;
        }
      }
    }
  }
  *cpp = cp;
  return r;
}

static int trimend(indata,np,fatal)
char *indata;
unsigned int *np;
char *fatal;
	/* Scans indata (length *np) backwards, dropping trailing blanks,   */
	/* tabs, CR and LF as well as any trailing "-Reply" (compared with  */
	/* case_startb), repeating until neither is found. The shortened    */
	/* length is stored in *np; the data itself is not modified.        */
	/* Returns 1 if a "-Reply" was dropped, 0 otherwise.                */
	/* fatal is not used here. */
{
  unsigned int len;
  int isreply = 0;
  int again;

  len = *np;
  if (len == 0) return 0;
  do {
    again = 0;
    while (len > 0) {			/* trailing white space */
      char ch = indata[len - 1];
      if (ch != ' ' && ch != '\t' && ch != '\r' && ch != '\n') break;
      --len;
    }
    if (len >= 6 && case_startb(indata + len - 6,6,"-Reply")) {
      len -= 6;				/* drop it and look again */
      isreply = 1;
      again = 1;
    }
  } while (again);
  *np = len;				/* new length */
  return isreply;
}

int unfoldHDR(indata,n,outdata,charset,prefix,flagtrimsub,fatal)
char *indata;
unsigned int n;
stralloc *outdata;
char *charset;
stralloc *prefix;
int flagtrimsub;
char *fatal;
	/* Normalizes a header (typically a subject) held in indata[0..n-1]. */
	/* Line unfolding and Q/B decoding must already have been done.      */
	/* Mainly, redundant iso-2022 shift codes/designations are removed,  */
	/* and a trailing "-Reply" plus trailing white space are trimmed.    */
	/*   indata      - header text. For iso-2022-jp it is modified in    */
	/*                 place (JIS-roman/JIS-1978 designations rewritten).*/
	/*   outdata     - receives the single-line result, without trailing */
	/*                 '\n' or '\0'; '\0' and '\n' inside become ' '.    */
	/*   charset     - character set name; its first 11 bytes are        */
	/*                 compared without regard to case.                  */
	/*   prefix      - list prefix handed to trimre().                   */
	/*   flagtrimsub - if set, leading reply indicators and the prefix   */
	/*                 are removed as well.                              */
	/*   fatal       - passed to die_nomem() if memory runs out.         */
	/* returns 0 = no reply no prefix */
	/*         1 = reply no prefix    */
	/*         2 = no reply, prefix   */
	/*         3 = reply & prefix     */
{
  int r = 0;
  char *cp,*cpesc,*cpnext,*cpend,*cpout;
  char state,cset,newcset;
  int reg,newreg;

  cp = indata;		/* JIS X 0201 -> ISO646 us-ascii */
  cpend = cp + n - 1;	/* last byte of input */
  cpnext = cp;		/* start of the input not yet copied to tmpdata */
  cset = 0;		/* no SO-designation seen yet */
  if (!stralloc_copys(&tmpdata,"")) die_nomem(fatal);
  if (!stralloc_ready(&tmpdata,n)) die_nomem(fatal);

  if(!case_diffb(charset,11,"iso-2022-jp")) {
	/* iso-2022-jp-2 (rfc1554) and its subset iso-2022-jp. The reg #s */
	/* are from the rfc. Don't ask why they have multiple length G0   */
	/* charset designations ... JIS X 0201-roman is identical to      */
	/* iso646 us-ascii except for currency and tilde. Making them the */
	/* same increases hits without significant loss. JIS X 0208-1978  */
	/* is superseded by JIS X 0208-1983 and converted here as well.   */

    while (cp < cpend) {
      if (*cp++ != ESC) continue;
      if (*cp == '(') {
        if (++cp > cpend) break;
        if (*cp == 'J') *cp = 'B';	/* ESC ( J -> ESC ( B */
        ++cp;
      } else if (*cp == '$') {
        if (++cp > cpend) break;
        if (*cp == '@') *cp = 'B';	/* ESC $ @ -> ESC $ B */
        ++cp;
      }
    }
		/* eliminate redundant ESC seqs */
    cp = indata;
    cpnext = cp;
    reg = 6;				/* start out in us-ascii */
    while (cp < cpend) {
      if (*cp++ != ESC) continue;
      cpesc = cp - 1;			/* start of this designation */
      if (*cp == '$') {
        if (++cp > cpend) break;
        if (*cp == 'B') newreg = 87;
        else if (*cp == 'A') newreg = 58;
        else if (*cp == '(') {
          if (++cp > cpend) break;
          if (*cp == 'C') newreg = 149;
          else if (*cp == 'D') newreg = 159;
          else continue;
        } else continue;
      } else if (*cp == '(') {
        if (++cp > cpend) break;
        if (*cp == 'B') newreg = 6;
        else continue;
      } else continue;
      if (++cp > cpend) break;
      while (*cp == ' ' || *cp == '\t')
        if (++cp >= cpend) break;	/* skip space */
		/* maybe another G0 designation; only peek at the next */
		/* byte if there is one, cp+1 may be past the input */
      if (cp < cpend && *cp == ESC &&
          (*(cp+1) == '(' || *(cp+1) == '$')) {	/* yep! drop this one */
        if (!stralloc_catb(&tmpdata,cpnext,cpesc-cpnext)) die_nomem(fatal);
        cpnext = cp;
        continue;
      }
      if (reg == newreg) {		/* already active: drop it */
        if (!stralloc_catb(&tmpdata,cpnext,cpesc-cpnext)) die_nomem(fatal);
        cpnext = cp;
      } else {
        reg = newreg;
      }
    }
		/* copy remainder of line */
    if (!stralloc_catb(&tmpdata,cpnext,cpend - cpnext + 1)) die_nomem(fatal);
    if (reg != 6) {	/* need to return to us-ascii at the end of the line */
      if (!stralloc_cats(&tmpdata,TOASCII)) die_nomem(fatal);
    } else {		/* maybe "-Reply at the end?" */
      r = trimend(tmpdata.s,&(tmpdata.len),fatal);
    }

  } else if (!case_diffb(charset,11,"iso-2022-cn") ||
             !case_diffb(charset,11,"iso-2022-kr")) {
	/* these use SI/SO and ESC $ ) x as the SO designation. In -cn and */
	/* -cn-ext, 'x' can be a number of different letters. In -kr it's  */
	/* always 'C'. This routine may work also for other iso-2022 sets  */
	/* also handles iso-2022-cn-ext */
    cpesc = (char *) 0;	/* points to latest ESC */
    state = SI;		/* us-ascii */
    --cp;		/* set up for loop */

    while (++cp <= cpend) {
      if (*cp == SI || *cp == SO) {
        if (state == *cp) {		 /* already in state. Skip shift seq */
		/* copy everything before the redundant shift and resume */
		/* after it. The length is 0, not -1, if the shift is    */
		/* the very first byte. */
          if (!stralloc_catb(&tmpdata,cpnext,cp-cpnext)) die_nomem(fatal);
          cpnext = cp + 1;
        } else				/* set new state */
          state = *cp;
        if (++cp > cpend) break;
        continue;
      }
      if (*cp != ESC) continue;
      if (cp + 3 > cpend) break;	/* not space for full SO-designation */
      cpesc = cp;
	/* NOTE(review): *cp is still ESC at this point, so the test below */
	/* always succeeds and the SO-designation handling that follows is */
	/* never reached (a "++cp" appears to be missing). Deliberately    */
	/* left unchanged: enabling it alters the normalized subject and   */
	/* the "break"s below would then stop the SI/SO scan early.        */
      if (*cp != '$') continue;
      if (++cp > cpend) break;
      if (*cp != ')') continue;
      if (++cp > cpend) break;
      newcset = *cp;
      if (++cp > cpend) break;
      while (cp <= cpend && (*cp == ' ' || *cp == '\t')) ++cp;
      if (cp + 3 > cpend) break;	/* no space for full SO-designation */
      if ((*cp == ESC && *(cp+1) == '$' && *(cp+2) == ')')
		|| (newcset == cset)) {
			/* skip if a second SO-designation right after or */
			/* this SO-designation is already active, skip */
        if (!stralloc_catb(&tmpdata,cpnext,cpesc-cpnext)) die_nomem(fatal);
        --cp;		/* "unpeek" so that next iteration will see char */
        cpnext = cpesc + 4;
        continue;
      } else {
        cset = newcset;
        continue;
      }
    }
			/* get remainder of line */
    if (!stralloc_catb(&tmpdata,cpnext,cpend - cpnext + 1)) die_nomem(fatal);
	/* braces are required: without them the "else" pairs with the  */
	/* inner "if" and the -Reply trimming is tied to the wrong case */
    if (state != SI) {	/* need to end in ascii */
      if (!stralloc_cats(&tmpdata,TOSI)) die_nomem(fatal);
    } else {		/* ascii end; maybe "-Reply" at the end? */
      r = trimend(tmpdata.s,&(tmpdata.len),fatal);
    }

  } else {		/* other character sets = no special treatment */
    r = trimend(cp,&n,fatal);		/* -reply */
    if (!stralloc_copyb(&tmpdata,cp,n)) die_nomem(fatal);
  }

  cp = tmpdata.s;
  n = tmpdata.len;
  cpend = cp + n - 1;
  if (flagtrimsub) {	 /* remove leading reply indicators & prefix*/
    r |= trimre(&cp,cpend,prefix,fatal);
			/* never let the length wrap around should cp */
			/* have ended up beyond the end of the data */
    n = (cp <= cpend) ? (unsigned int) (cpend-cp+1) : 0;
  }
			/* there shouldn't be '\0' or '\n', but make sure as */
			/* it would break the message index */
  if (!stralloc_copys(outdata,"")) die_nomem(fatal);
  if (!stralloc_ready(outdata,n)) die_nomem(fatal);
  outdata->len = n;
  cpout = outdata->s;
  while (n--) {		/* '\n' and '\0' would break the subject index */
    if (!*cp || *cp == '\n') *cpout = ' ';
    else *cpout = *cp;
    ++cp; ++cpout;
  }
  return r;
}
Commit	Line	Data
f8beb284 MW	1	/$Id: unfoldHDR.c,v 1.14 1999/11/06 05:25:14 lindberg Exp $/
	2	/$Name: ezmlm-idx-040 $/
	3
	4	#include "stralloc.h"
	5	#include "strerr.h"
	6	#include "case.h"
	7	#include "byte.h"
	8	#include "errtxt.h"
	9	#include "mime.h"
	10
	11	static stralloc tmpdata = {0};
	12
	13	static int trimre(cpp,cpend,prefix,fatal)
	14	char **cpp;
	15	char *cpend;
	16	stralloc *prefix;
	17	char *fatal;
	18
	19	{
	20	int r = 0;
	21	register char *cp;
	22	char *cpnew;
	23	int junk;
	24	unsigned int i,j;
	25	unsigned int serial;
	26
	27	cp = *cpp;
	28	serial = prefix->len; /* pointer to serial number */
	29	if (serial)
	30	serial = byte_rchr(prefix->s,prefix->len,'#');
	31
	32	junk = 1;
	33	while (junk) {
	34	junk = 0;
	35	while (cp <= cpend && (cp == ' ' \|\| cp == '\t')) cp++;
	36	cpnew = cp;
	37	while (++cpnew <= cpend) { /* /(..+:\s)/ is a reply indicator */
	38	if (*cpnew == ' ') {
	39	if (cpnew < cp + 3) break; /* at least 3 char before ' ' */
	40	if ((cpnew - 1) != ':') break; / require ':' before ' ' */
	41	if (cpnew > cp + 5) { /* if > 4 char before ':' require */
	42	register char ch;
	43	ch = (cpnew - 2); / XX^3, XX[3], XX(3) */
	44	if (ch != ')' && ch != ']' && (ch < '0' \|\| ch > '9'))
	45	break;
	46	}
	47	junk = 1;
	48	r \|= 1;
	49	cp = cpnew + 1;
	50	break;
	51	}
	52	}
	53	/* prefix removal is complicated by the inconsistent handling of ' ' */
	54	/* when there are rfc2047-encoded words in the subject. We first */
	55	/* compare prefix before "serial" ignoring space, then skip the */
	56	/* number, then compare after "serial". If both matched we've found */
	57	/* the prefix. */
	58	if (serial) {
	59	cpnew = cp;
	60	i = 0;
	61	while (i < serial && cpnew <= cpend) {
	62	if (*cpnew != ' ') {
	63	if (prefix->s[i] == ' ') {
	64	++i;
65	continue;
66	}
67	if (*cpnew != prefix->s[i]) break;
68	++i;
69	}
70	++cpnew;
71	}
72	if (i == serial) { /* match before serial */
73	j = prefix->len;
74	if (serial != j) { /* got a '#' */
75	while (cpnew <= cpend && /* skip number/space */
76	cpnew == ' ' \|\| (cpnew <= '9' && *cpnew >= '0')) ++cpnew;
77	i = serial + 1;
78	while (i < j && cpnew <= cpend) {
79	if (*cpnew != ' ') {
80	if (prefix->s[i] == ' ') {
81	++i;
82	continue;
83	}
84	if (*cpnew != prefix->s[i]) break;
85	++i;
86	}
87	++cpnew;
88	}
89	}
90	if (i == j) {
91	cp = cpnew;
92	junk = 1;
93	r \|= 2;
94	}
95	}
96	}
97	}
98	*cpp = cp;
99	return r;
100	}
101
102	static int trimend(indata,np,fatal)
103	char *indata;
104	unsigned int *np;
105	char *fatal;
106	/* looks at indata of length n from the end removing LWSP & '\n' */
107	/* and any trailing '-Reply'. Sets n to new length and returns: */
108	/* 0 - not reply, 1 - reply. */
109	{
110	char *cplast;
111	int junk;
112	int r = 0;
113
114	if (*np == 0) return 0;
115	cplast = indata + np - 1; / points to last char on line */
116	junk = 1;
117	while (junk) {
118	junk = 0;
119	while (cplast >= indata &&
120	(cplast == ' ' \|\| cplast == '\t' \|\|
121	cplast == '\r' \|\| cplast == '\n'))
122	--cplast;
123	if (cplast - indata >= 5 && case_startb(cplast - 5,6,"-Reply")) {
124	cplast -= 6;
125	r = 1;
126	junk = 1;
127	}
128	}
129	np = (unsigned int) (cplast - indata + 1); / new length */
130	return r;
131	}
132
133	int unfoldHDR(indata,n,outdata,charset,prefix,flagtrimsub,fatal)
134	char *indata;
135	unsigned int n;
136	stralloc *outdata;
137	char *charset;
138	stralloc *prefix;
139	int flagtrimsub;
140	char *fatal;
141	/* takes a header as indata. Removal of reply-indicators is done */
142	/* but removal of line breaks and Q and B decoding should have */
143	/* been done. Returns a */
144	/* single line header without trailing \n or \0. Mainly, we */
145	/* remove redundant shift codes */
146	/* returns 0 = no reply no prefix */
147	/* 1 = reply no prefix */
148	/* 2 = no reply, prefix */
149	/* 3 = reply & pefix */
150	{
151	int r = 0;
152	char cp,cpesc,cpnext,cpend,*cpout;
153	char state,cset,newcset;
154	int reg,newreg;
155
156	cp = indata; /* JIS X 0201 -> ISO646 us-ascii */
157	cpend = cp + n - 1;
158	cpnext = cp;
159	if (!stralloc_copys(&tmpdata,"")) die_nomem(fatal);
160	if (!stralloc_ready(&tmpdata,n)) die_nomem(fatal);
161
162	if(!case_diffb(charset,11,"iso-2022-jp")) {
163	/* iso-2022-jp-2 (rfc1554) and its subset iso-2022-jp. The reg #s */
164	/* are from the rfc. Don't ask why they have multiple length G0 */
165	/* charset designations ... JIS X 0201-roman is identical to */
166	/* iso646 us-ascii except for currency and tilde. Making them the */
167	/* same increases hits without significant loss. JIS X 0208-1978 */
168	/* is superceded by JIS X 0208-1983 and converted here as well. */
169
170	while (cp < cpend) {
171	if (*cp++ != ESC) continue;
172	if (*cp == '(') {
173	if (++cp > cpend) break;
174	if (cp == 'J') cp = 'B';
175	++cp;
176	} else if (*cp == '$') {
177	if (++cp > cpend) break;
178	if (cp == '@') cp = 'B';
179	++cp;
180	}
181	}
182	/* eliminate redundant ESC seqs */
183	cp = indata;
184	cpnext = cp;
185	reg = 6;
186	while (cp < cpend) {
187	if (*cp++ != ESC) continue;
188	cpesc = cp - 1;
189	if (*cp == '$') {
190	if (++cp > cpend) break;
191	if (*cp == 'B') newreg = 87;
192	else if (*cp == 'A') newreg = 58;
193	else if (*cp == '(') {
194	if (++cp > cpend) break;
195	if (*cp == 'C') newreg = 149;
196	else if (*cp == 'D') newreg = 159;
197	else continue;
198	} else continue;
199	} else if (*cp == '(') {
200	if (++cp > cpend) break;
201	if (*cp == 'B') newreg = 6;
202	else continue;
203	} else continue;
204	if (++cp > cpend) break;
205	while (cp == ' ' \|\| cp == '\t')
206	if (++cp >= cpend) break; /* skip space */
207	if (cp == ESC) / maybe another G0 designation */
208	if ((cp+1) == '(' \|\| (cp+1) == '$') { /* yep! */
209	if (!stralloc_catb(&tmpdata,cpnext,cpesc-cpnext)) die_nomem(fatal);
210	cpnext = cp;
211	continue;
212	}
213	if (reg == newreg) {
214	if (!stralloc_catb(&tmpdata,cpnext,cpesc-cpnext)) die_nomem(fatal);
215	cpnext = cp;
216	} else {
217	reg = newreg;
218	} /* copy remainder of line */
219	}
220	if (!stralloc_catb(&tmpdata,cpnext,cpend - cpnext + 1)) die_nomem(fatal);
221	if (reg != 6) { /* need to return to us-ascii at the end of the line */
222	if (!stralloc_cats(&tmpdata,TOASCII)) die_nomem(fatal);
223	} else { /* maybe "-Reply at the end?" */
224	r = trimend(tmpdata.s,&(tmpdata.len),fatal);
225	}
226
227	} else if (!case_diffb(charset,11,"iso-2022-cn") \|\|
228	!case_diffb(charset,11,"iso-2022-kr")) {
229	/* these use SI/SO and ESC $ ) x as the SO designation. In -cn and */
230	/* -cn-ext, 'x' can be a number of different letters. In -kr it's */
231	/* always 'C'. This routine may work also for other iso-2022 sets */
232	/* also handles iso-2022-cn-ext */
233	cpesc = (char ) 0; / points to latest ESC */
234	state = SI; /* us-ascii */
235	--cp; /* set up for loop */
236
237	while (++cp <= cpend) {
238	if (cp == SI \|\| cp == SO) {
239	if (state == cp) { / already in state. Skip shift seq */
240	if (!stralloc_catb(&tmpdata,cpnext,cp-cpnext-1)) die_nomem(fatal);
241	cpnext = cp;
242	} else /* set new state */
243	state = *cp;
244	if (++cp > cpend) break;
245	continue;
246	}
247	if (*cp != ESC) continue;
248	if (cp + 3 > cpend) break; /* not space for full SO-designation */
249	cpesc = cp;
250	if (*cp != '$') continue;
251	if (++cp > cpend) break;
252	if (*cp != ')') continue;
253	if (++cp > cpend) break;
254	newcset = *cp;
255	if (++cp > cpend) break;
256	while (cp <= cpend && (cp == ' ' \|\| cp == '\t')) ++cp;
257	if (cp + 3 > cpend) break; /* no space for full SO-designation */
258	if ((cp == ESC && (cp+1) == '$' && *(cp+2) == ')')
259	\|\| (newcset == cset)) {
260	/* skip if a second SO-designation right after or */
261	/* this SO-designation is already active, skip */
262	if (!stralloc_catb(&tmpdata,cpnext,cpesc-cpnext)) die_nomem(fatal);
263	--cp; /* "unpeek" so that next iteration will see char */
264	cpnext = cpesc + 4;
265	continue;
266	} else {
267	cset = newcset;
268	continue;
269	}
270	}
271	/* get remainder of line */
272	if (!stralloc_catb(&tmpdata,cpnext,cpend - cpnext + 1)) die_nomem(fatal);
273	if (state != SI) /* need to end in ascii */
274	if (!stralloc_cats(&tmpdata,TOSI)) die_nomem(fatal);
275	else /* ascii end; maybe "-Reply" at the end? */
276	r = trimend(tmpdata.s,&(tmpdata.len),fatal);
277
278	} else { /* other character sets = no special treatment */
279	r = trimend(cp,&n,fatal); /* -reply */
280	if (!stralloc_copyb(&tmpdata,cp,n)) die_nomem(fatal);
281	}
282
283	cp = tmpdata.s;
284	n = tmpdata.len;
285	cpend = cp + n - 1;
286	if (flagtrimsub) { /* remove leading reply indicators & prefix*/
287	r \|= trimre(&cp,cpend,prefix,fatal);
288	n = (unsigned int) (cpend-cp+1);
289	}
290	/* there shouldn't be '\0' or '\n', but make sure as */
291	/* it would break the message index */
292	if (!stralloc_copys(outdata,"")) die_nomem(fatal);
293	if (!stralloc_ready(outdata,n)) die_nomem(fatal);
294	outdata->len = n;
295	cpout = outdata->s;
296	while (n--) { /* '\n' and '\0' would break the subject index */
297	if (!cp \|\| cp == '\n') *cpout = ' ';
298	else cpout = cp;
299	++cp; ++cpout;
300	}
301	return r;
302	}
303