Import ezmlm-idx 0.40
[ezmlm] / unfoldHDR.c
1 /*$Id: unfoldHDR.c,v 1.14 1999/11/06 05:25:14 lindberg Exp $*/
2 /*$Name: ezmlm-idx-040 $*/
3
4 #include "stralloc.h"
5 #include "strerr.h"
6 #include "case.h"
7 #include "byte.h"
8 #include "errtxt.h"
9 #include "mime.h"
10
11 static stralloc tmpdata = {0}; /* scratch buffer reused by every unfoldHDR() call; holds the charset-normalized header before reply/prefix trimming */
12
13 static int trimre(cpp,cpend,prefix,fatal)
14 char **cpp;
15 char *cpend;
16 stralloc *prefix;
17 char *fatal;
18
19 {
20 int r = 0;
21 register char *cp;
22 char *cpnew;
23 int junk;
24 unsigned int i,j;
25 unsigned int serial;
26
27 cp = *cpp;
28 serial = prefix->len; /* pointer to serial number */
29 if (serial)
30 serial = byte_rchr(prefix->s,prefix->len,'#');
31
32 junk = 1;
33 while (junk) {
34 junk = 0;
35 while (cp <= cpend && (*cp == ' ' || *cp == '\t')) cp++;
36 cpnew = cp;
37 while (++cpnew <= cpend) { /* /(..+:\s)/ is a reply indicator */
38 if (*cpnew == ' ') {
39 if (cpnew < cp + 3) break; /* at least 3 char before ' ' */
40 if (*(cpnew - 1) != ':') break; /* require ':' before ' ' */
41 if (cpnew > cp + 5) { /* if > 4 char before ':' require */
42 register char ch;
43 ch = *(cpnew - 2); /* XX^3, XX[3], XX(3) */
44 if (ch != ')' && ch != ']' && (ch < '0' || ch > '9'))
45 break;
46 }
47 junk = 1;
48 r |= 1;
49 cp = cpnew + 1;
50 break;
51 }
52 }
53 /* prefix removal is complicated by the inconsistent handling of ' ' */
54 /* when there are rfc2047-encoded words in the subject. We first */
55 /* compare prefix before "serial" ignoring space, then skip the */
56 /* number, then compare after "serial". If both matched we've found */
57 /* the prefix. */
58 if (serial) {
59 cpnew = cp;
60 i = 0;
61 while (i < serial && cpnew <= cpend) {
62 if (*cpnew != ' ') {
63 if (prefix->s[i] == ' ') {
64 ++i;
65 continue;
66 }
67 if (*cpnew != prefix->s[i]) break;
68 ++i;
69 }
70 ++cpnew;
71 }
72 if (i == serial) { /* match before serial */
73 j = prefix->len;
74 if (serial != j) { /* got a '#' */
75 while (cpnew <= cpend && /* skip number/space */
76 *cpnew == ' ' || (*cpnew <= '9' && *cpnew >= '0')) ++cpnew;
77 i = serial + 1;
78 while (i < j && cpnew <= cpend) {
79 if (*cpnew != ' ') {
80 if (prefix->s[i] == ' ') {
81 ++i;
82 continue;
83 }
84 if (*cpnew != prefix->s[i]) break;
85 ++i;
86 }
87 ++cpnew;
88 }
89 }
90 if (i == j) {
91 cp = cpnew;
92 junk = 1;
93 r |= 2;
94 }
95 }
96 }
97 }
98 *cpp = cp;
99 return r;
100 }
101
static int trimend(indata,np,fatal)
char *indata;
unsigned int *np;
char *fatal;
/* Looks at indata of length *np from the end, removing trailing ' ', */
/* '\t', '\r', '\n' and any trailing "-Reply" (as matched by          */
/* case_startb(), presumably ignoring case). Sets *np to the new      */
/* length and returns: 0 - not reply, 1 - reply.                      */
/* "fatal" is accepted but not used here.                             */
/* Works on a length rather than on a "last char" pointer so that no  */
/* pointer below indata is ever formed when everything is trimmed.    */
{
  unsigned int len;
  int r = 0;

  len = *np;
  for (;;) {
    while (len > 0) {			/* strip trailing white space */
      register char ch;
      ch = indata[len - 1];
      if (ch != ' ' && ch != '\t' && ch != '\r' && ch != '\n') break;
      --len;
    }
    if (len < 6 || !case_startb(indata + len - 6,6,"-Reply"))
      break;				/* nothing more to remove */
    len -= 6;				/* drop "-Reply" and go around again */
    r = 1;
  }
  *np = len;				/* new length */
  return r;
}
132
133 int unfoldHDR(indata,n,outdata,charset,prefix,flagtrimsub,fatal)
134 char *indata;
135 unsigned int n;
136 stralloc *outdata;
137 char *charset;
138 stralloc *prefix;
139 int flagtrimsub;
140 char *fatal;
141 /* takes a header as indata. Removal of reply-indicators is done */
142 /* but removal of line breaks and Q and B decoding should have */
143 /* been done. Returns a */
144 /* single line header without trailing \n or \0. Mainly, we */
145 /* remove redundant shift codes */
146 /* returns 0 = no reply no prefix */
147 /* 1 = reply no prefix */
148 /* 2 = no reply, prefix */
149 /* 3 = reply & pefix */
150 {
151 int r = 0;
152 char *cp,*cpesc,*cpnext,*cpend,*cpout;
153 char state,cset,newcset;
154 int reg,newreg;
155
156 cp = indata; /* JIS X 0201 -> ISO646 us-ascii */
157 cpend = cp + n - 1;
158 cpnext = cp;
159 if (!stralloc_copys(&tmpdata,"")) die_nomem(fatal);
160 if (!stralloc_ready(&tmpdata,n)) die_nomem(fatal);
161
162 if(!case_diffb(charset,11,"iso-2022-jp")) {
163 /* iso-2022-jp-2 (rfc1554) and its subset iso-2022-jp. The reg #s */
164 /* are from the rfc. Don't ask why they have multiple length G0 */
165 /* charset designations ... JIS X 0201-roman is identical to */
166 /* iso646 us-ascii except for currency and tilde. Making them the */
167 /* same increases hits without significant loss. JIS X 0208-1978 */
168 /* is superceded by JIS X 0208-1983 and converted here as well. */
169
170 while (cp < cpend) {
171 if (*cp++ != ESC) continue;
172 if (*cp == '(') {
173 if (++cp > cpend) break;
174 if (*cp == 'J') *cp = 'B';
175 ++cp;
176 } else if (*cp == '$') {
177 if (++cp > cpend) break;
178 if (*cp == '@') *cp = 'B';
179 ++cp;
180 }
181 }
182 /* eliminate redundant ESC seqs */
183 cp = indata;
184 cpnext = cp;
185 reg = 6;
186 while (cp < cpend) {
187 if (*cp++ != ESC) continue;
188 cpesc = cp - 1;
189 if (*cp == '$') {
190 if (++cp > cpend) break;
191 if (*cp == 'B') newreg = 87;
192 else if (*cp == 'A') newreg = 58;
193 else if (*cp == '(') {
194 if (++cp > cpend) break;
195 if (*cp == 'C') newreg = 149;
196 else if (*cp == 'D') newreg = 159;
197 else continue;
198 } else continue;
199 } else if (*cp == '(') {
200 if (++cp > cpend) break;
201 if (*cp == 'B') newreg = 6;
202 else continue;
203 } else continue;
204 if (++cp > cpend) break;
205 while (*cp == ' ' || *cp == '\t')
206 if (++cp >= cpend) break; /* skip space */
207 if (*cp == ESC) /* maybe another G0 designation */
208 if (*(cp+1) == '(' || *(cp+1) == '$') { /* yep! */
209 if (!stralloc_catb(&tmpdata,cpnext,cpesc-cpnext)) die_nomem(fatal);
210 cpnext = cp;
211 continue;
212 }
213 if (reg == newreg) {
214 if (!stralloc_catb(&tmpdata,cpnext,cpesc-cpnext)) die_nomem(fatal);
215 cpnext = cp;
216 } else {
217 reg = newreg;
218 } /* copy remainder of line */
219 }
220 if (!stralloc_catb(&tmpdata,cpnext,cpend - cpnext + 1)) die_nomem(fatal);
221 if (reg != 6) { /* need to return to us-ascii at the end of the line */
222 if (!stralloc_cats(&tmpdata,TOASCII)) die_nomem(fatal);
223 } else { /* maybe "-Reply at the end?" */
224 r = trimend(tmpdata.s,&(tmpdata.len),fatal);
225 }
226
227 } else if (!case_diffb(charset,11,"iso-2022-cn") ||
228 !case_diffb(charset,11,"iso-2022-kr")) {
229 /* these use SI/SO and ESC $ ) x as the SO designation. In -cn and */
230 /* -cn-ext, 'x' can be a number of different letters. In -kr it's */
231 /* always 'C'. This routine may work also for other iso-2022 sets */
232 /* also handles iso-2022-cn-ext */
233 cpesc = (char *) 0; /* points to latest ESC */
234 state = SI; /* us-ascii */
235 --cp; /* set up for loop */
236
237 while (++cp <= cpend) {
238 if (*cp == SI || *cp == SO) {
239 if (state == *cp) { /* already in state. Skip shift seq */
240 if (!stralloc_catb(&tmpdata,cpnext,cp-cpnext-1)) die_nomem(fatal);
241 cpnext = cp;
242 } else /* set new state */
243 state = *cp;
244 if (++cp > cpend) break;
245 continue;
246 }
247 if (*cp != ESC) continue;
248 if (cp + 3 > cpend) break; /* not space for full SO-designation */
249 cpesc = cp;
250 if (*cp != '$') continue;
251 if (++cp > cpend) break;
252 if (*cp != ')') continue;
253 if (++cp > cpend) break;
254 newcset = *cp;
255 if (++cp > cpend) break;
256 while (cp <= cpend && (*cp == ' ' || *cp == '\t')) ++cp;
257 if (cp + 3 > cpend) break; /* no space for full SO-designation */
258 if ((*cp == ESC && *(cp+1) == '$' && *(cp+2) == ')')
259 || (newcset == cset)) {
260 /* skip if a second SO-designation right after or */
261 /* this SO-designation is already active, skip */
262 if (!stralloc_catb(&tmpdata,cpnext,cpesc-cpnext)) die_nomem(fatal);
263 --cp; /* "unpeek" so that next iteration will see char */
264 cpnext = cpesc + 4;
265 continue;
266 } else {
267 cset = newcset;
268 continue;
269 }
270 }
271 /* get remainder of line */
272 if (!stralloc_catb(&tmpdata,cpnext,cpend - cpnext + 1)) die_nomem(fatal);
273 if (state != SI) /* need to end in ascii */
274 if (!stralloc_cats(&tmpdata,TOSI)) die_nomem(fatal);
275 else /* ascii end; maybe "-Reply" at the end? */
276 r = trimend(tmpdata.s,&(tmpdata.len),fatal);
277
278 } else { /* other character sets = no special treatment */
279 r = trimend(cp,&n,fatal); /* -reply */
280 if (!stralloc_copyb(&tmpdata,cp,n)) die_nomem(fatal);
281 }
282
283 cp = tmpdata.s;
284 n = tmpdata.len;
285 cpend = cp + n - 1;
286 if (flagtrimsub) { /* remove leading reply indicators & prefix*/
287 r |= trimre(&cp,cpend,prefix,fatal);
288 n = (unsigned int) (cpend-cp+1);
289 }
290 /* there shouldn't be '\0' or '\n', but make sure as */
291 /* it would break the message index */
292 if (!stralloc_copys(outdata,"")) die_nomem(fatal);
293 if (!stralloc_ready(outdata,n)) die_nomem(fatal);
294 outdata->len = n;
295 cpout = outdata->s;
296 while (n--) { /* '\n' and '\0' would break the subject index */
297 if (!*cp || *cp == '\n') *cpout = ' ';
298 else *cpout = *cp;
299 ++cp; ++cpout;
300 }
301 return r;
302 }
303