Commit | Line | Data |
---|---|---|
f8beb284 MW |
1 | /*$Id: unfoldHDR.c,v 1.14 1999/11/06 05:25:14 lindberg Exp $*/ |
2 | /*$Name: ezmlm-idx-040 $*/ | |
3 | ||
4 | #include "stralloc.h" | |
5 | #include "strerr.h" | |
6 | #include "case.h" | |
7 | #include "byte.h" | |
8 | #include "errtxt.h" | |
9 | #include "mime.h" | |
10 | ||
11 | static stralloc tmpdata = {0}; /* scratch buffer reused across calls: unfoldHDR assembles the cleaned header here before copying it into outdata */ | |
12 | ||
13 | static int trimre(cpp,cpend,prefix,fatal) | |
14 | char **cpp; | |
15 | char *cpend; | |
16 | stralloc *prefix; | |
17 | char *fatal; | |
18 | ||
19 | { | |
20 | int r = 0; | |
21 | register char *cp; | |
22 | char *cpnew; | |
23 | int junk; | |
24 | unsigned int i,j; | |
25 | unsigned int serial; | |
26 | ||
27 | cp = *cpp; | |
28 | serial = prefix->len; /* pointer to serial number */ | |
29 | if (serial) | |
30 | serial = byte_rchr(prefix->s,prefix->len,'#'); | |
31 | ||
32 | junk = 1; | |
33 | while (junk) { | |
34 | junk = 0; | |
35 | while (cp <= cpend && (*cp == ' ' || *cp == '\t')) cp++; | |
36 | cpnew = cp; | |
37 | while (++cpnew <= cpend) { /* /(..+:\s)/ is a reply indicator */ | |
38 | if (*cpnew == ' ') { | |
39 | if (cpnew < cp + 3) break; /* at least 3 char before ' ' */ | |
40 | if (*(cpnew - 1) != ':') break; /* require ':' before ' ' */ | |
41 | if (cpnew > cp + 5) { /* if > 4 char before ':' require */ | |
42 | register char ch; | |
43 | ch = *(cpnew - 2); /* XX^3, XX[3], XX(3) */ | |
44 | if (ch != ')' && ch != ']' && (ch < '0' || ch > '9')) | |
45 | break; | |
46 | } | |
47 | junk = 1; | |
48 | r |= 1; | |
49 | cp = cpnew + 1; | |
50 | break; | |
51 | } | |
52 | } | |
53 | /* prefix removal is complicated by the inconsistent handling of ' ' */ | |
54 | /* when there are rfc2047-encoded words in the subject. We first */ | |
55 | /* compare prefix before "serial" ignoring space, then skip the */ | |
56 | /* number, then compare after "serial". If both matched we've found */ | |
57 | /* the prefix. */ | |
58 | if (serial) { | |
59 | cpnew = cp; | |
60 | i = 0; | |
61 | while (i < serial && cpnew <= cpend) { | |
62 | if (*cpnew != ' ') { | |
63 | if (prefix->s[i] == ' ') { | |
64 | ++i; | |
65 | continue; | |
66 | } | |
67 | if (*cpnew != prefix->s[i]) break; | |
68 | ++i; | |
69 | } | |
70 | ++cpnew; | |
71 | } | |
72 | if (i == serial) { /* match before serial */ | |
73 | j = prefix->len; | |
74 | if (serial != j) { /* got a '#' */ | |
75 | while (cpnew <= cpend && /* skip number/space */ | |
76 | *cpnew == ' ' || (*cpnew <= '9' && *cpnew >= '0')) ++cpnew; | |
77 | i = serial + 1; | |
78 | while (i < j && cpnew <= cpend) { | |
79 | if (*cpnew != ' ') { | |
80 | if (prefix->s[i] == ' ') { | |
81 | ++i; | |
82 | continue; | |
83 | } | |
84 | if (*cpnew != prefix->s[i]) break; | |
85 | ++i; | |
86 | } | |
87 | ++cpnew; | |
88 | } | |
89 | } | |
90 | if (i == j) { | |
91 | cp = cpnew; | |
92 | junk = 1; | |
93 | r |= 2; | |
94 | } | |
95 | } | |
96 | } | |
97 | } | |
98 | *cpp = cp; | |
99 | return r; | |
100 | } | |
101 | ||
/*
 * trimend - cut trailing white space and "-Reply" markers off a line.
 *
 *   indata  start of the line.
 *   np      in/out: length of the line; updated to the trimmed length.
 *   fatal   not used by this function.
 *
 * Trailing ' ', '\t', '\r' and '\n' are removed, then a trailing "-Reply"
 * (matched with case_startb), and this repeats until neither is found.
 * Returns 1 if at least one "-Reply" was removed, 0 otherwise.
 */
static int trimend(indata,np,fatal)
char *indata;
unsigned int *np;
char *fatal;
{
  unsigned int len = *np;
  int isreply = 0;

  if (len == 0) return 0;

  for (;;) {
    /* drop trailing LWSP and line terminators */
    while (len > 0) {
      char last = indata[len - 1];
      if (last != ' ' && last != '\t' && last != '\r' && last != '\n')
        break;
      --len;
    }
    /* is there room for, and do we have, a trailing "-Reply"? */
    if (len < 6) break;
    if (!case_startb(indata + len - 6,6,"-Reply")) break;
    len -= 6;
    isreply = 1;
  }

  *np = len;				/* new length */
  return isreply;
}
132 | ||
133 | int unfoldHDR(indata,n,outdata,charset,prefix,flagtrimsub,fatal) | |
134 | char *indata; | |
135 | unsigned int n; | |
136 | stralloc *outdata; | |
137 | char *charset; | |
138 | stralloc *prefix; | |
139 | int flagtrimsub; | |
140 | char *fatal; | |
141 | /* takes a header as indata. Removal of reply-indicators is done */ | |
142 | /* but removal of line breaks and Q and B decoding should have */ | |
143 | /* been done. Returns a */ | |
144 | /* single line header without trailing \n or \0. Mainly, we */ | |
145 | /* remove redundant shift codes */ | |
146 | /* returns 0 = no reply no prefix */ | |
147 | /* 1 = reply no prefix */ | |
148 | /* 2 = no reply, prefix */ | |
149 | /* 3 = reply & pefix */ | |
150 | { | |
151 | int r = 0; | |
152 | char *cp,*cpesc,*cpnext,*cpend,*cpout; | |
153 | char state,cset,newcset; | |
154 | int reg,newreg; | |
155 | ||
156 | cp = indata; /* JIS X 0201 -> ISO646 us-ascii */ | |
157 | cpend = cp + n - 1; | |
158 | cpnext = cp; | |
159 | if (!stralloc_copys(&tmpdata,"")) die_nomem(fatal); | |
160 | if (!stralloc_ready(&tmpdata,n)) die_nomem(fatal); | |
161 | ||
162 | if(!case_diffb(charset,11,"iso-2022-jp")) { | |
163 | /* iso-2022-jp-2 (rfc1554) and its subset iso-2022-jp. The reg #s */ | |
164 | /* are from the rfc. Don't ask why they have multiple length G0 */ | |
165 | /* charset designations ... JIS X 0201-roman is identical to */ | |
166 | /* iso646 us-ascii except for currency and tilde. Making them the */ | |
167 | /* same increases hits without significant loss. JIS X 0208-1978 */ | |
168 | /* is superceded by JIS X 0208-1983 and converted here as well. */ | |
169 | ||
170 | while (cp < cpend) { | |
171 | if (*cp++ != ESC) continue; | |
172 | if (*cp == '(') { | |
173 | if (++cp > cpend) break; | |
174 | if (*cp == 'J') *cp = 'B'; | |
175 | ++cp; | |
176 | } else if (*cp == '$') { | |
177 | if (++cp > cpend) break; | |
178 | if (*cp == '@') *cp = 'B'; | |
179 | ++cp; | |
180 | } | |
181 | } | |
182 | /* eliminate redundant ESC seqs */ | |
183 | cp = indata; | |
184 | cpnext = cp; | |
185 | reg = 6; | |
186 | while (cp < cpend) { | |
187 | if (*cp++ != ESC) continue; | |
188 | cpesc = cp - 1; | |
189 | if (*cp == '$') { | |
190 | if (++cp > cpend) break; | |
191 | if (*cp == 'B') newreg = 87; | |
192 | else if (*cp == 'A') newreg = 58; | |
193 | else if (*cp == '(') { | |
194 | if (++cp > cpend) break; | |
195 | if (*cp == 'C') newreg = 149; | |
196 | else if (*cp == 'D') newreg = 159; | |
197 | else continue; | |
198 | } else continue; | |
199 | } else if (*cp == '(') { | |
200 | if (++cp > cpend) break; | |
201 | if (*cp == 'B') newreg = 6; | |
202 | else continue; | |
203 | } else continue; | |
204 | if (++cp > cpend) break; | |
205 | while (*cp == ' ' || *cp == '\t') | |
206 | if (++cp >= cpend) break; /* skip space */ | |
207 | if (*cp == ESC) /* maybe another G0 designation */ | |
208 | if (*(cp+1) == '(' || *(cp+1) == '$') { /* yep! */ | |
209 | if (!stralloc_catb(&tmpdata,cpnext,cpesc-cpnext)) die_nomem(fatal); | |
210 | cpnext = cp; | |
211 | continue; | |
212 | } | |
213 | if (reg == newreg) { | |
214 | if (!stralloc_catb(&tmpdata,cpnext,cpesc-cpnext)) die_nomem(fatal); | |
215 | cpnext = cp; | |
216 | } else { | |
217 | reg = newreg; | |
218 | } /* copy remainder of line */ | |
219 | } | |
220 | if (!stralloc_catb(&tmpdata,cpnext,cpend - cpnext + 1)) die_nomem(fatal); | |
221 | if (reg != 6) { /* need to return to us-ascii at the end of the line */ | |
222 | if (!stralloc_cats(&tmpdata,TOASCII)) die_nomem(fatal); | |
223 | } else { /* maybe "-Reply at the end?" */ | |
224 | r = trimend(tmpdata.s,&(tmpdata.len),fatal); | |
225 | } | |
226 | ||
227 | } else if (!case_diffb(charset,11,"iso-2022-cn") || | |
228 | !case_diffb(charset,11,"iso-2022-kr")) { | |
229 | /* these use SI/SO and ESC $ ) x as the SO designation. In -cn and */ | |
230 | /* -cn-ext, 'x' can be a number of different letters. In -kr it's */ | |
231 | /* always 'C'. This routine may work also for other iso-2022 sets */ | |
232 | /* also handles iso-2022-cn-ext */ | |
233 | cpesc = (char *) 0; /* points to latest ESC */ | |
234 | state = SI; /* us-ascii */ | |
235 | --cp; /* set up for loop */ | |
236 | ||
237 | while (++cp <= cpend) { | |
238 | if (*cp == SI || *cp == SO) { | |
239 | if (state == *cp) { /* already in state. Skip shift seq */ | |
240 | if (!stralloc_catb(&tmpdata,cpnext,cp-cpnext-1)) die_nomem(fatal); | |
241 | cpnext = cp; | |
242 | } else /* set new state */ | |
243 | state = *cp; | |
244 | if (++cp > cpend) break; | |
245 | continue; | |
246 | } | |
247 | if (*cp != ESC) continue; | |
248 | if (cp + 3 > cpend) break; /* not space for full SO-designation */ | |
249 | cpesc = cp; | |
250 | if (*cp != '$') continue; | |
251 | if (++cp > cpend) break; | |
252 | if (*cp != ')') continue; | |
253 | if (++cp > cpend) break; | |
254 | newcset = *cp; | |
255 | if (++cp > cpend) break; | |
256 | while (cp <= cpend && (*cp == ' ' || *cp == '\t')) ++cp; | |
257 | if (cp + 3 > cpend) break; /* no space for full SO-designation */ | |
258 | if ((*cp == ESC && *(cp+1) == '$' && *(cp+2) == ')') | |
259 | || (newcset == cset)) { | |
260 | /* skip if a second SO-designation right after or */ | |
261 | /* this SO-designation is already active, skip */ | |
262 | if (!stralloc_catb(&tmpdata,cpnext,cpesc-cpnext)) die_nomem(fatal); | |
263 | --cp; /* "unpeek" so that next iteration will see char */ | |
264 | cpnext = cpesc + 4; | |
265 | continue; | |
266 | } else { | |
267 | cset = newcset; | |
268 | continue; | |
269 | } | |
270 | } | |
271 | /* get remainder of line */ | |
272 | if (!stralloc_catb(&tmpdata,cpnext,cpend - cpnext + 1)) die_nomem(fatal); | |
273 | if (state != SI) /* need to end in ascii */ | |
274 | if (!stralloc_cats(&tmpdata,TOSI)) die_nomem(fatal); | |
275 | else /* ascii end; maybe "-Reply" at the end? */ | |
276 | r = trimend(tmpdata.s,&(tmpdata.len),fatal); | |
277 | ||
278 | } else { /* other character sets = no special treatment */ | |
279 | r = trimend(cp,&n,fatal); /* -reply */ | |
280 | if (!stralloc_copyb(&tmpdata,cp,n)) die_nomem(fatal); | |
281 | } | |
282 | ||
283 | cp = tmpdata.s; | |
284 | n = tmpdata.len; | |
285 | cpend = cp + n - 1; | |
286 | if (flagtrimsub) { /* remove leading reply indicators & prefix*/ | |
287 | r |= trimre(&cp,cpend,prefix,fatal); | |
288 | n = (unsigned int) (cpend-cp+1); | |
289 | } | |
290 | /* there shouldn't be '\0' or '\n', but make sure as */ | |
291 | /* it would break the message index */ | |
292 | if (!stralloc_copys(outdata,"")) die_nomem(fatal); | |
293 | if (!stralloc_ready(outdata,n)) die_nomem(fatal); | |
294 | outdata->len = n; | |
295 | cpout = outdata->s; | |
296 | while (n--) { /* '\n' and '\0' would break the subject index */ | |
297 | if (!*cp || *cp == '\n') *cpout = ' '; | |
298 | else *cpout = *cp; | |
299 | ++cp; ++cpout; | |
300 | } | |
301 | return r; | |
302 | } | |
303 |