mdw@git.distorted.org.uk Git - ezmlm/blame_incremental

... / ...

Commit	Line	Data
	1	/*$Id: unfoldHDR.c,v 1.14 1999/11/06 05:25:14 lindberg Exp $*/
	2	/*$Name: ezmlm-idx-040 $*/
	3
	4	#include "stralloc.h"
	5	#include "strerr.h"
	6	#include "case.h"
	7	#include "byte.h"
	8	#include "errtxt.h"
	9	#include "mime.h"
	10
	11	static stralloc tmpdata = {0}; /* file-scope scratch buffer: unfoldHDR() resets it on every call and builds the cleaned header here before copying it to outdata */
	12
	13	static int trimre(cpp,cpend,prefix,fatal)
	14	char **cpp;
	15	char *cpend;
	16	stralloc *prefix;
	17	char *fatal;
	18
	19	{
	20	int r = 0;
	21	register char *cp;
	22	char *cpnew;
	23	int junk;
	24	unsigned int i,j;
	25	unsigned int serial;
	26
	27	cp = *cpp;
	28	serial = prefix->len; /* pointer to serial number */
	29	if (serial)
	30	serial = byte_rchr(prefix->s,prefix->len,'#');
	31
	32	junk = 1;
	33	while (junk) {
	34	junk = 0;
	35	while (cp <= cpend && (cp == ' ' \|\| cp == '\t')) cp++;
	36	cpnew = cp;
	37	while (++cpnew <= cpend) { /* /(..+:\s)/ is a reply indicator */
	38	if (*cpnew == ' ') {
	39	if (cpnew < cp + 3) break; /* at least 3 char before ' ' */
	40	if ((cpnew - 1) != ':') break; / require ':' before ' ' */
	41	if (cpnew > cp + 5) { /* if > 4 char before ':' require */
	42	register char ch;
	43	ch = (cpnew - 2); / XX^3, XX[3], XX(3) */
	44	if (ch != ')' && ch != ']' && (ch < '0' \|\| ch > '9'))
	45	break;
	46	}
	47	junk = 1;
	48	r \|= 1;
	49	cp = cpnew + 1;
	50	break;
	51	}
	52	}
	53	/* prefix removal is complicated by the inconsistent handling of ' ' */
	54	/* when there are rfc2047-encoded words in the subject. We first */
	55	/* compare prefix before "serial" ignoring space, then skip the */
	56	/* number, then compare after "serial". If both matched we've found */
	57	/* the prefix. */
	58	if (serial) {
	59	cpnew = cp;
	60	i = 0;
	61	while (i < serial && cpnew <= cpend) {
	62	if (*cpnew != ' ') {
	63	if (prefix->s[i] == ' ') {
	64	++i;
	65	continue;
	66	}
	67	if (*cpnew != prefix->s[i]) break;
	68	++i;
	69	}
	70	++cpnew;
	71	}
	72	if (i == serial) { /* match before serial */
	73	j = prefix->len;
	74	if (serial != j) { /* got a '#' */
	75	while (cpnew <= cpend && /* skip number/space */
	76	cpnew == ' ' \|\| (cpnew <= '9' && *cpnew >= '0')) ++cpnew;
	77	i = serial + 1;
	78	while (i < j && cpnew <= cpend) {
	79	if (*cpnew != ' ') {
	80	if (prefix->s[i] == ' ') {
	81	++i;
	82	continue;
	83	}
	84	if (*cpnew != prefix->s[i]) break;
	85	++i;
	86	}
	87	++cpnew;
	88	}
	89	}
	90	if (i == j) {
	91	cp = cpnew;
	92	junk = 1;
	93	r \|= 2;
	94	}
	95	}
	96	}
	97	}
	98	*cpp = cp;
	99	return r;
	100	}
	101
	/* Trims the end of a header line.
	 *
	 *   indata - the text
	 *   np     - in: length of the text; out: length after trimming
	 *   fatal  - unused here
	 *
	 * Working backwards from the end, removes trailing ' ', '\t', '\r' and
	 * '\n' and any trailing "-Reply" (matched with case_startb), repeating
	 * until neither is found. The text itself is not modified; only *np is.
	 * Returns 1 if at least one "-Reply" was removed, 0 otherwise. */
	static int trimend(indata,np,fatal)
	char *indata;
	unsigned int *np;
	char *fatal;
	{
	  unsigned int len = *np;	/* number of characters still kept */
	  int junk = 1;
	  int r = 0;
	  char ch;

	  while (junk) {
	    junk = 0;
	    while (len > 0) {
	      ch = indata[len - 1];
	      if (ch != ' ' && ch != '\t' && ch != '\r' && ch != '\n') break;
	      --len;
	    }
	    /* tracking a length rather than a "last char" pointer avoids */
	    /* stepping below indata when "-Reply" starts at offset 0 */
	    if (len >= 6 && case_startb(indata + len - 6,6,"-Reply")) {
	      len -= 6;
	      r = 1;
	      junk = 1;
	    }
	  }
	  *np = len;			/* new length */
	  return r;
	}
	132
	133	int unfoldHDR(indata,n,outdata,charset,prefix,flagtrimsub,fatal)
	134	char *indata;
	135	unsigned int n;
	136	stralloc *outdata;
	137	char *charset;
	138	stralloc *prefix;
	139	int flagtrimsub;
	140	char *fatal;
	141	/* takes a header as indata. Removal of reply-indicators is done */
	142	/* but removal of line breaks and Q and B decoding should have */
	143	/* been done. Returns a */
	144	/* single line header without trailing \n or \0. Mainly, we */
	145	/* remove redundant shift codes */
	146	/* returns 0 = no reply no prefix */
	147	/* 1 = reply no prefix */
	148	/* 2 = no reply, prefix */
	149	/* 3 = reply & pefix */
	150	{
	151	int r = 0;
	152	char cp,cpesc,cpnext,cpend,*cpout;
	153	char state,cset,newcset;
	154	int reg,newreg;
	155
	156	cp = indata; /* JIS X 0201 -> ISO646 us-ascii */
	157	cpend = cp + n - 1;
	158	cpnext = cp;
	159	if (!stralloc_copys(&tmpdata,"")) die_nomem(fatal);
	160	if (!stralloc_ready(&tmpdata,n)) die_nomem(fatal);
	161
	162	if(!case_diffb(charset,11,"iso-2022-jp")) {
	163	/* iso-2022-jp-2 (rfc1554) and its subset iso-2022-jp. The reg #s */
	164	/* are from the rfc. Don't ask why they have multiple length G0 */
	165	/* charset designations ... JIS X 0201-roman is identical to */
	166	/* iso646 us-ascii except for currency and tilde. Making them the */
	167	/* same increases hits without significant loss. JIS X 0208-1978 */
	168	/* is superceded by JIS X 0208-1983 and converted here as well. */
	169
	170	while (cp < cpend) {
	171	if (*cp++ != ESC) continue;
	172	if (*cp == '(') {
	173	if (++cp > cpend) break;
	174	if (cp == 'J') cp = 'B';
	175	++cp;
	176	} else if (*cp == '$') {
	177	if (++cp > cpend) break;
	178	if (cp == '@') cp = 'B';
	179	++cp;
	180	}
	181	}
	182	/* eliminate redundant ESC seqs */
	183	cp = indata;
	184	cpnext = cp;
	185	reg = 6;
	186	while (cp < cpend) {
	187	if (*cp++ != ESC) continue;
	188	cpesc = cp - 1;
	189	if (*cp == '$') {
	190	if (++cp > cpend) break;
	191	if (*cp == 'B') newreg = 87;
	192	else if (*cp == 'A') newreg = 58;
	193	else if (*cp == '(') {
	194	if (++cp > cpend) break;
	195	if (*cp == 'C') newreg = 149;
	196	else if (*cp == 'D') newreg = 159;
	197	else continue;
	198	} else continue;
	199	} else if (*cp == '(') {
	200	if (++cp > cpend) break;
	201	if (*cp == 'B') newreg = 6;
	202	else continue;
	203	} else continue;
	204	if (++cp > cpend) break;
	205	while (cp == ' ' \|\| cp == '\t')
	206	if (++cp >= cpend) break; /* skip space */
	207	if (cp == ESC) / maybe another G0 designation */
	208	if ((cp+1) == '(' \|\| (cp+1) == '$') { /* yep! */
	209	if (!stralloc_catb(&tmpdata,cpnext,cpesc-cpnext)) die_nomem(fatal);
	210	cpnext = cp;
	211	continue;
	212	}
	213	if (reg == newreg) {
	214	if (!stralloc_catb(&tmpdata,cpnext,cpesc-cpnext)) die_nomem(fatal);
	215	cpnext = cp;
	216	} else {
	217	reg = newreg;
	218	} /* copy remainder of line */
	219	}
	220	if (!stralloc_catb(&tmpdata,cpnext,cpend - cpnext + 1)) die_nomem(fatal);
	221	if (reg != 6) { /* need to return to us-ascii at the end of the line */
	222	if (!stralloc_cats(&tmpdata,TOASCII)) die_nomem(fatal);
	223	} else { /* maybe "-Reply at the end?" */
	224	r = trimend(tmpdata.s,&(tmpdata.len),fatal);
	225	}
	226
	227	} else if (!case_diffb(charset,11,"iso-2022-cn") \|\|
	228	!case_diffb(charset,11,"iso-2022-kr")) {
	229	/* these use SI/SO and ESC $ ) x as the SO designation. In -cn and */
	230	/* -cn-ext, 'x' can be a number of different letters. In -kr it's */
	231	/* always 'C'. This routine may work also for other iso-2022 sets */
	232	/* also handles iso-2022-cn-ext */
	233	cpesc = (char ) 0; / points to latest ESC */
	234	state = SI; /* us-ascii */
	235	--cp; /* set up for loop */
	236
	237	while (++cp <= cpend) {
	238	if (cp == SI \|\| cp == SO) {
	239	if (state == cp) { / already in state. Skip shift seq */
	240	if (!stralloc_catb(&tmpdata,cpnext,cp-cpnext-1)) die_nomem(fatal);
	241	cpnext = cp;
	242	} else /* set new state */
	243	state = *cp;
	244	if (++cp > cpend) break;
	245	continue;
	246	}
	247	if (*cp != ESC) continue;
	248	if (cp + 3 > cpend) break; /* not space for full SO-designation */
	249	cpesc = cp;
	250	if (*cp != '$') continue;
	251	if (++cp > cpend) break;
	252	if (*cp != ')') continue;
	253	if (++cp > cpend) break;
	254	newcset = *cp;
	255	if (++cp > cpend) break;
	256	while (cp <= cpend && (cp == ' ' \|\| cp == '\t')) ++cp;
	257	if (cp + 3 > cpend) break; /* no space for full SO-designation */
	258	if ((cp == ESC && (cp+1) == '$' && *(cp+2) == ')')
	259	\|\| (newcset == cset)) {
	260	/* skip if a second SO-designation right after or */
	261	/* this SO-designation is already active, skip */
	262	if (!stralloc_catb(&tmpdata,cpnext,cpesc-cpnext)) die_nomem(fatal);
	263	--cp; /* "unpeek" so that next iteration will see char */
	264	cpnext = cpesc + 4;
	265	continue;
	266	} else {
	267	cset = newcset;
	268	continue;
	269	}
	270	}
	271	/* get remainder of line */
	272	if (!stralloc_catb(&tmpdata,cpnext,cpend - cpnext + 1)) die_nomem(fatal);
	273	if (state != SI) /* need to end in ascii */
	274	if (!stralloc_cats(&tmpdata,TOSI)) die_nomem(fatal);
	275	else /* ascii end; maybe "-Reply" at the end? */
	276	r = trimend(tmpdata.s,&(tmpdata.len),fatal);
	277
	278	} else { /* other character sets = no special treatment */
	279	r = trimend(cp,&n,fatal); /* -reply */
	280	if (!stralloc_copyb(&tmpdata,cp,n)) die_nomem(fatal);
	281	}
	282
	283	cp = tmpdata.s;
	284	n = tmpdata.len;
	285	cpend = cp + n - 1;
	286	if (flagtrimsub) { /* remove leading reply indicators & prefix*/
	287	r \|= trimre(&cp,cpend,prefix,fatal);
	288	n = (unsigned int) (cpend-cp+1);
	289	}
	290	/* there shouldn't be '\0' or '\n', but make sure as */
	291	/* it would break the message index */
	292	if (!stralloc_copys(outdata,"")) die_nomem(fatal);
	293	if (!stralloc_ready(outdata,n)) die_nomem(fatal);
	294	outdata->len = n;
	295	cpout = outdata->s;
	296	while (n--) { /* '\n' and '\0' would break the subject index */
	297	if (!cp \|\| cp == '\n') *cpout = ' ';
	298	else cpout = cp;
	299	++cp; ++cpout;
	300	}
	301	return r;
	302	}
	303