1 /*$Id: unfoldHDR.c,v 1.14 1999/11/06 05:25:14 lindberg Exp $*/
2 /*$Name: ezmlm-idx-040 $*/
/* File-scope scratch buffer, zero-initialised. unfoldHDR() in this file */
/* empties it, sizes it and assembles the cleaned-up header text in it.  */
11 static stralloc tmpdata
= {0};
/* trimre: scans the text from cp up to and including cpend (every loop  */
/* below tests "<= cpend") for leading reply indicators and for the list */
/* prefix.                                                               */
/*   cpp    - address of the caller's text pointer (unfoldHDR passes     */
/*            &cp). Presumably updated to point past whatever is removed */
/*            -- the assignment is not visible in this listing; confirm. */
/*   cpend  - last character of the text (inclusive bound).              */
/*   prefix - object with ->s (bytes) and ->len (length); a '#' in it    */
/*            marks where a serial number is expected.                   */
/*   fatal  - not used in the visible lines; presumably passed on to the */
/*            error path -- confirm.                                     */
/* The return statements are not visible here; the caller ORs the result */
/* into its own return value. NOTE(review): this listing omits several   */
/* original lines (declarations, braces, returns), so the comments below */
/* describe only what is shown.                                          */
13 static int trimre(cpp
,cpend
,prefix
,fatal
)
/* Default: serial == prefix->len. */
28 serial
= prefix
->len
; /* pointer to serial number */
/* Search the prefix for '#'. byte_rchr presumably returns the index of   */
/* the last '#', or prefix->len when there is none -- confirm against the */
/* byte library. serial is used as an index below, not as a pointer.      */
30 serial
= byte_rchr(prefix
->s
,prefix
->len
,'#');
/* Skip leading blanks and tabs. */
35 while (cp
<= cpend
&& (*cp
== ' ' || *cp
== '\t')) cp
++;
/* Walk cpnew forward looking for the end of a reply indicator such as */
/* "Re: "; the tests below reject candidates that do not fit.          */
37 while (++cpnew
<= cpend
) { /* /(..+:\s)/ is a reply indicator */
39 if (cpnew
< cp
+ 3) break; /* at least 3 char before ' ' */
40 if (*(cpnew
- 1) != ':') break; /* require ':' before ' ' */
41 if (cpnew
> cp
+ 5) { /* if > 4 char before ':' require */
/* ch is the character just before the ':'; longer indicators are */
/* accepted only when it is ')', ']' or a decimal digit.          */
43 ch
= *(cpnew
- 2); /* XX^3, XX[3], XX(3) */
44 if (ch
!= ')' && ch
!= ']' && (ch
< '0' || ch
> '9'))
53 /* prefix removal is complicated by the inconsistent handling of ' ' */
54 /* when there are rfc2047-encoded words in the subject. We first */
55 /* compare prefix before "serial" ignoring space, then skip the */
56 /* number, then compare after "serial". If both matched we've found */
/* First pass: compare prefix->s[0..serial) against the text. Blanks in */
/* the prefix get special handling (that branch body is not shown).     */
61 while (i
< serial
&& cpnew
<= cpend
) {
63 if (prefix
->s
[i
] == ' ') {
67 if (*cpnew
!= prefix
->s
[i
]) break;
/* i reached serial: everything before the '#' matched. */
72 if (i
== serial
) { /* match before serial */
/* j presumably holds prefix->len (its assignment is not shown), so */
/* serial != j means a '#' was found -- confirm.                    */
74 if (serial
!= j
) { /* got a '#' */
/* NOTE(review): operator precedence. "&&" binds tighter than "||", so */
/* this condition parses as                                            */
/*   (cpnew <= cpend && *cpnew == ' ') || (digit test)                 */
/* and the digit test reads *cpnew without the cpnew <= cpend guard.   */
/* The likely intent is                                                */
/*   cpnew <= cpend && (*cpnew == ' ' || digit test)                   */
/* As written the loop can step past cpend while digits continue.      */
75 while (cpnew
<= cpend
&& /* skip number/space */
76 *cpnew
== ' ' || (*cpnew
<= '9' && *cpnew
>= '0')) ++cpnew
;
/* Second pass: compare the part of the prefix after the serial position */
/* (indices i..j) against the text, with the same blank handling.        */
78 while (i
< j
&& cpnew
<= cpend
) {
80 if (prefix
->s
[i
] == ' ') {
84 if (*cpnew
!= prefix
->s
[i
]) break;
/* trimend: trims the tail of a buffer in place by shortening its length. */
/*   indata - start of the text.                                          */
/*   np     - pointer to the length; read on entry and overwritten with   */
/*            the new length (the "n" in the comment below is *np).       */
/*   fatal  - not used in the visible lines.                              */
/* Only the "return 0" for an empty buffer is visible in this listing;    */
/* the other return statements are among the omitted lines.               */
102 static int trimend(indata
,np
,fatal
)
106 /* looks at indata of length n from the end removing LWSP & '\n' */
107 /* and any trailing '-Reply'. Sets n to new length and returns: */
108 /* 0 - not reply, 1 - reply. */
/* Empty input: nothing to trim, and not a reply. */
114 if (*np
== 0) return 0;
115 cplast
= indata
+ *np
- 1; /* points to last char on line */
/* Step back over trailing blank, tab, CR and LF. The loop body is not */
/* shown; presumably it decrements cplast. For an all-whitespace       */
/* buffer cplast would end one before indata.                          */
119 while (cplast
>= indata
&&
120 (*cplast
== ' ' || *cplast
== '\t' ||
121 *cplast
== '\r' || *cplast
== '\n'))
/* At least 6 bytes remain: test whether the last 6 are "-Reply".     */
/* case_startb is presumably a case-insensitive comparison -- confirm. */
123 if (cplast
- indata
>= 5 && case_startb(cplast
- 5,6,"-Reply")) {
/* Store the trimmed length (index of the last kept char plus one). */
129 *np
= (unsigned int) (cplast
- indata
+ 1); /* new length */
133 int unfoldHDR(indata
,n
,outdata
,charset
,prefix
,flagtrimsub
,fatal
)
141 /* takes a header as indata. Removal of reply-indicators is done */
142 /* but removal of line breaks and Q and B decoding should have */
143 /* been done. Returns a */
144 /* single line header without trailing \n or \0. Mainly, we */
145 /* remove redundant shift codes */
146 /* returns 0 = no reply no prefix */
147 /* 1 = reply no prefix */
148 /* 2 = no reply, prefix */
149 /* 3 = reply & prefix */
152 char *cp
,*cpesc
,*cpnext
,*cpend
,*cpout
;
153 char state
,cset
,newcset
;
156 cp
= indata
; /* JIS X 0201 -> ISO646 us-ascii */
159 if (!stralloc_copys(&tmpdata
,"")) die_nomem(fatal
);
160 if (!stralloc_ready(&tmpdata
,n
)) die_nomem(fatal
);
162 if(!case_diffb(charset
,11,"iso-2022-jp")) {
163 /* iso-2022-jp-2 (rfc1554) and its subset iso-2022-jp. The reg #s */
164 /* are from the rfc. Don't ask why they have multiple length G0 */
165 /* charset designations ... JIS X 0201-roman is identical to */
166 /* iso646 us-ascii except for currency and tilde. Making them the */
167 /* same increases hits without significant loss. JIS X 0208-1978 */
168 /* is superseded by JIS X 0208-1983 and converted here as well. */
171 if (*cp
++ != ESC
) continue;
173 if (++cp
> cpend
) break;
174 if (*cp
== 'J') *cp
= 'B';
176 } else if (*cp
== '$') {
177 if (++cp
> cpend
) break;
178 if (*cp
== '@') *cp
= 'B';
182 /* eliminate redundant ESC seqs */
187 if (*cp
++ != ESC
) continue;
190 if (++cp
> cpend
) break;
191 if (*cp
== 'B') newreg
= 87;
192 else if (*cp
== 'A') newreg
= 58;
193 else if (*cp
== '(') {
194 if (++cp
> cpend
) break;
195 if (*cp
== 'C') newreg
= 149;
196 else if (*cp
== 'D') newreg
= 159;
199 } else if (*cp
== '(') {
200 if (++cp
> cpend
) break;
201 if (*cp
== 'B') newreg
= 6;
204 if (++cp
> cpend
) break;
205 while (*cp
== ' ' || *cp
== '\t')
206 if (++cp
>= cpend
) break; /* skip space */
207 if (*cp
== ESC
) /* maybe another G0 designation */
208 if (*(cp
+1) == '(' || *(cp
+1) == '$') { /* yep! */
209 if (!stralloc_catb(&tmpdata
,cpnext
,cpesc
-cpnext
)) die_nomem(fatal
);
214 if (!stralloc_catb(&tmpdata
,cpnext
,cpesc
-cpnext
)) die_nomem(fatal
);
218 } /* copy remainder of line */
220 if (!stralloc_catb(&tmpdata
,cpnext
,cpend
- cpnext
+ 1)) die_nomem(fatal
);
221 if (reg
!= 6) { /* need to return to us-ascii at the end of the line */
222 if (!stralloc_cats(&tmpdata
,TOASCII
)) die_nomem(fatal
);
223 } else { /* maybe "-Reply" at the end? */
224 r
= trimend(tmpdata
.s
,&(tmpdata
.len
),fatal
);
227 } else if (!case_diffb(charset
,11,"iso-2022-cn") ||
228 !case_diffb(charset
,11,"iso-2022-kr")) {
229 /* these use SI/SO and ESC $ ) x as the SO designation. In -cn and */
230 /* -cn-ext, 'x' can be a number of different letters. In -kr it's */
231 /* always 'C'. This routine may work also for other iso-2022 sets */
232 /* also handles iso-2022-cn-ext */
233 cpesc
= (char *) 0; /* points to latest ESC */
234 state
= SI
; /* us-ascii */
235 --cp
; /* set up for loop */
237 while (++cp
<= cpend
) {
238 if (*cp
== SI
|| *cp
== SO
) {
239 if (state
== *cp
) { /* already in state. Skip shift seq */
240 if (!stralloc_catb(&tmpdata
,cpnext
,cp
-cpnext
-1)) die_nomem(fatal
);
242 } else /* set new state */
244 if (++cp
> cpend
) break;
247 if (*cp
!= ESC
) continue;
248 if (cp
+ 3 > cpend
) break; /* not space for full SO-designation */
250 if (*cp
!= '$') continue;
251 if (++cp
> cpend
) break;
252 if (*cp
!= ')') continue;
253 if (++cp
> cpend
) break;
255 if (++cp
> cpend
) break;
256 while (cp
<= cpend
&& (*cp
== ' ' || *cp
== '\t')) ++cp
;
257 if (cp
+ 3 > cpend
) break; /* no space for full SO-designation */
258 if ((*cp
== ESC
&& *(cp
+1) == '$' && *(cp
+2) == ')')
259 || (newcset
== cset
)) {
260 /* skip if a second SO-designation right after or */
261 /* this SO-designation is already active, skip */
262 if (!stralloc_catb(&tmpdata
,cpnext
,cpesc
-cpnext
)) die_nomem(fatal
);
263 --cp
; /* "unpeek" so that next iteration will see char */
271 /* get remainder of line */
272 if (!stralloc_catb(&tmpdata
,cpnext
,cpend
- cpnext
+ 1)) die_nomem(fatal
);
273 if (state
!= SI
) /* need to end in ascii */
274 if (!stralloc_cats(&tmpdata
,TOSI
)) die_nomem(fatal
);
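275 else /* NOTE(review): with no braces this else pairs with the inner if (!stralloc_cats...) just above, not with "state != SI"; intended meaning: ascii end; maybe "-Reply" at the end? */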
276 r
= trimend(tmpdata
.s
,&(tmpdata
.len
),fatal
);
278 } else { /* other character sets = no special treatment */
279 r
= trimend(cp
,&n
,fatal
); /* -reply */
280 if (!stralloc_copyb(&tmpdata
,cp
,n
)) die_nomem(fatal
);
286 if (flagtrimsub
) { /* remove leading reply indicators & prefix*/
287 r
|= trimre(&cp
,cpend
,prefix
,fatal
);
288 n
= (unsigned int) (cpend
-cp
+1);
290 /* there shouldn't be '\0' or '\n', but make sure as */
291 /* it would break the message index */
292 if (!stralloc_copys(outdata
,"")) die_nomem(fatal
);
293 if (!stralloc_ready(outdata
,n
)) die_nomem(fatal
);
296 while (n
--) { /* '\n' and '\0' would break the subject index */
297 if (!*cp
|| *cp
== '\n') *cpout
= ' ';