| 1 | /*$Id: unfoldHDR.c,v 1.14 1999/11/06 05:25:14 lindberg Exp $*/ |
| 2 | /*$Name: ezmlm-idx-040 $*/ |
| 3 | |
| 4 | #include "stralloc.h" |
| 5 | #include "strerr.h" |
| 6 | #include "case.h" |
| 7 | #include "byte.h" |
| 8 | #include "errtxt.h" |
| 9 | #include "mime.h" |
| 10 | |
| 11 | static stralloc tmpdata = {0}; /* scratch buffer: unfoldHDR() resets it on every call, builds the cleaned header in it, then copies the result to the caller's outdata */ |
| 12 | |
| 13 | static int trimre(cpp,cpend,prefix,fatal) |
| 14 | char **cpp; |
| 15 | char *cpend; |
| 16 | stralloc *prefix; |
| 17 | char *fatal; |
| 18 | |
| 19 | { |
| 20 | int r = 0; |
| 21 | register char *cp; |
| 22 | char *cpnew; |
| 23 | int junk; |
| 24 | unsigned int i,j; |
| 25 | unsigned int serial; |
| 26 | |
| 27 | cp = *cpp; |
| 28 | serial = prefix->len; /* pointer to serial number */ |
| 29 | if (serial) |
| 30 | serial = byte_rchr(prefix->s,prefix->len,'#'); |
| 31 | |
| 32 | junk = 1; |
| 33 | while (junk) { |
| 34 | junk = 0; |
| 35 | while (cp <= cpend && (*cp == ' ' || *cp == '\t')) cp++; |
| 36 | cpnew = cp; |
| 37 | while (++cpnew <= cpend) { /* /(..+:\s)/ is a reply indicator */ |
| 38 | if (*cpnew == ' ') { |
| 39 | if (cpnew < cp + 3) break; /* at least 3 char before ' ' */ |
| 40 | if (*(cpnew - 1) != ':') break; /* require ':' before ' ' */ |
| 41 | if (cpnew > cp + 5) { /* if > 4 char before ':' require */ |
| 42 | register char ch; |
| 43 | ch = *(cpnew - 2); /* XX^3, XX[3], XX(3) */ |
| 44 | if (ch != ')' && ch != ']' && (ch < '0' || ch > '9')) |
| 45 | break; |
| 46 | } |
| 47 | junk = 1; |
| 48 | r |= 1; |
| 49 | cp = cpnew + 1; |
| 50 | break; |
| 51 | } |
| 52 | } |
| 53 | /* prefix removal is complicated by the inconsistent handling of ' ' */ |
| 54 | /* when there are rfc2047-encoded words in the subject. We first */ |
| 55 | /* compare prefix before "serial" ignoring space, then skip the */ |
| 56 | /* number, then compare after "serial". If both matched we've found */ |
| 57 | /* the prefix. */ |
| 58 | if (serial) { |
| 59 | cpnew = cp; |
| 60 | i = 0; |
| 61 | while (i < serial && cpnew <= cpend) { |
| 62 | if (*cpnew != ' ') { |
| 63 | if (prefix->s[i] == ' ') { |
| 64 | ++i; |
| 65 | continue; |
| 66 | } |
| 67 | if (*cpnew != prefix->s[i]) break; |
| 68 | ++i; |
| 69 | } |
| 70 | ++cpnew; |
| 71 | } |
| 72 | if (i == serial) { /* match before serial */ |
| 73 | j = prefix->len; |
| 74 | if (serial != j) { /* got a '#' */ |
| 75 | while (cpnew <= cpend && /* skip number/space */ |
| 76 | *cpnew == ' ' || (*cpnew <= '9' && *cpnew >= '0')) ++cpnew; |
| 77 | i = serial + 1; |
| 78 | while (i < j && cpnew <= cpend) { |
| 79 | if (*cpnew != ' ') { |
| 80 | if (prefix->s[i] == ' ') { |
| 81 | ++i; |
| 82 | continue; |
| 83 | } |
| 84 | if (*cpnew != prefix->s[i]) break; |
| 85 | ++i; |
| 86 | } |
| 87 | ++cpnew; |
| 88 | } |
| 89 | } |
| 90 | if (i == j) { |
| 91 | cp = cpnew; |
| 92 | junk = 1; |
| 93 | r |= 2; |
| 94 | } |
| 95 | } |
| 96 | } |
| 97 | } |
| 98 | *cpp = cp; |
| 99 | return r; |
| 100 | } |
| 101 | |
static int trimend(indata,np,fatal)
char *indata;
unsigned int *np;
char *fatal;
/* Works backwards from the end of the *np bytes at indata, dropping    */
/* trailing blanks, tabs, '\r' and '\n' as well as any trailing         */
/* "-Reply" (matched with case_startb, presumably case-insensitively),  */
/* as many times as they occur. The data itself is not modified; only   */
/* *np is updated to the shortened length. 'fatal' is unused.           */
/* Returns 1 if at least one "-Reply" was dropped, otherwise 0.         */
{
  unsigned int len = *np;
  int found = 0;

  if (len == 0) return 0;		/* nothing to look at; *np untouched */
  for (;;) {
    while (len > 0) {			/* strip trailing white space */
      char c = indata[len - 1];
      if (c != ' ' && c != '\t' && c != '\r' && c != '\n') break;
      --len;
    }
					/* need 6 bytes for "-Reply" */
    if (len < 6 || !case_startb(indata + len - 6,6,"-Reply")) break;
    len -= 6;
    found = 1;				/* go round again: more may precede it */
  }
  *np = len;				/* new length */
  return found;
}
| 132 | |
| 133 | int unfoldHDR(indata,n,outdata,charset,prefix,flagtrimsub,fatal) |
| 134 | char *indata; |
| 135 | unsigned int n; |
| 136 | stralloc *outdata; |
| 137 | char *charset; |
| 138 | stralloc *prefix; |
| 139 | int flagtrimsub; |
| 140 | char *fatal; |
| 141 | /* takes a header as indata. Removal of reply-indicators is done */ |
| 142 | /* but removal of line breaks and Q and B decoding should have */ |
| 143 | /* been done. Returns a */ |
| 144 | /* single line header without trailing \n or \0. Mainly, we */ |
| 145 | /* remove redundant shift codes */ |
| 146 | /* returns 0 = no reply no prefix */ |
| 147 | /* 1 = reply no prefix */ |
| 148 | /* 2 = no reply, prefix */ |
| 149 | /* 3 = reply & pefix */ |
| 150 | { |
| 151 | int r = 0; |
| 152 | char *cp,*cpesc,*cpnext,*cpend,*cpout; |
| 153 | char state,cset,newcset; |
| 154 | int reg,newreg; |
| 155 | |
| 156 | cp = indata; /* JIS X 0201 -> ISO646 us-ascii */ |
| 157 | cpend = cp + n - 1; |
| 158 | cpnext = cp; |
| 159 | if (!stralloc_copys(&tmpdata,"")) die_nomem(fatal); |
| 160 | if (!stralloc_ready(&tmpdata,n)) die_nomem(fatal); |
| 161 | |
| 162 | if(!case_diffb(charset,11,"iso-2022-jp")) { |
| 163 | /* iso-2022-jp-2 (rfc1554) and its subset iso-2022-jp. The reg #s */ |
| 164 | /* are from the rfc. Don't ask why they have multiple length G0 */ |
| 165 | /* charset designations ... JIS X 0201-roman is identical to */ |
| 166 | /* iso646 us-ascii except for currency and tilde. Making them the */ |
| 167 | /* same increases hits without significant loss. JIS X 0208-1978 */ |
| 168 | /* is superceded by JIS X 0208-1983 and converted here as well. */ |
| 169 | |
| 170 | while (cp < cpend) { |
| 171 | if (*cp++ != ESC) continue; |
| 172 | if (*cp == '(') { |
| 173 | if (++cp > cpend) break; |
| 174 | if (*cp == 'J') *cp = 'B'; |
| 175 | ++cp; |
| 176 | } else if (*cp == '$') { |
| 177 | if (++cp > cpend) break; |
| 178 | if (*cp == '@') *cp = 'B'; |
| 179 | ++cp; |
| 180 | } |
| 181 | } |
| 182 | /* eliminate redundant ESC seqs */ |
| 183 | cp = indata; |
| 184 | cpnext = cp; |
| 185 | reg = 6; |
| 186 | while (cp < cpend) { |
| 187 | if (*cp++ != ESC) continue; |
| 188 | cpesc = cp - 1; |
| 189 | if (*cp == '$') { |
| 190 | if (++cp > cpend) break; |
| 191 | if (*cp == 'B') newreg = 87; |
| 192 | else if (*cp == 'A') newreg = 58; |
| 193 | else if (*cp == '(') { |
| 194 | if (++cp > cpend) break; |
| 195 | if (*cp == 'C') newreg = 149; |
| 196 | else if (*cp == 'D') newreg = 159; |
| 197 | else continue; |
| 198 | } else continue; |
| 199 | } else if (*cp == '(') { |
| 200 | if (++cp > cpend) break; |
| 201 | if (*cp == 'B') newreg = 6; |
| 202 | else continue; |
| 203 | } else continue; |
| 204 | if (++cp > cpend) break; |
| 205 | while (*cp == ' ' || *cp == '\t') |
| 206 | if (++cp >= cpend) break; /* skip space */ |
| 207 | if (*cp == ESC) /* maybe another G0 designation */ |
| 208 | if (*(cp+1) == '(' || *(cp+1) == '$') { /* yep! */ |
| 209 | if (!stralloc_catb(&tmpdata,cpnext,cpesc-cpnext)) die_nomem(fatal); |
| 210 | cpnext = cp; |
| 211 | continue; |
| 212 | } |
| 213 | if (reg == newreg) { |
| 214 | if (!stralloc_catb(&tmpdata,cpnext,cpesc-cpnext)) die_nomem(fatal); |
| 215 | cpnext = cp; |
| 216 | } else { |
| 217 | reg = newreg; |
| 218 | } /* copy remainder of line */ |
| 219 | } |
| 220 | if (!stralloc_catb(&tmpdata,cpnext,cpend - cpnext + 1)) die_nomem(fatal); |
| 221 | if (reg != 6) { /* need to return to us-ascii at the end of the line */ |
| 222 | if (!stralloc_cats(&tmpdata,TOASCII)) die_nomem(fatal); |
| 223 | } else { /* maybe "-Reply at the end?" */ |
| 224 | r = trimend(tmpdata.s,&(tmpdata.len),fatal); |
| 225 | } |
| 226 | |
| 227 | } else if (!case_diffb(charset,11,"iso-2022-cn") || |
| 228 | !case_diffb(charset,11,"iso-2022-kr")) { |
| 229 | /* these use SI/SO and ESC $ ) x as the SO designation. In -cn and */ |
| 230 | /* -cn-ext, 'x' can be a number of different letters. In -kr it's */ |
| 231 | /* always 'C'. This routine may work also for other iso-2022 sets */ |
| 232 | /* also handles iso-2022-cn-ext */ |
| 233 | cpesc = (char *) 0; /* points to latest ESC */ |
| 234 | state = SI; /* us-ascii */ |
| 235 | --cp; /* set up for loop */ |
| 236 | |
| 237 | while (++cp <= cpend) { |
| 238 | if (*cp == SI || *cp == SO) { |
| 239 | if (state == *cp) { /* already in state. Skip shift seq */ |
| 240 | if (!stralloc_catb(&tmpdata,cpnext,cp-cpnext-1)) die_nomem(fatal); |
| 241 | cpnext = cp; |
| 242 | } else /* set new state */ |
| 243 | state = *cp; |
| 244 | if (++cp > cpend) break; |
| 245 | continue; |
| 246 | } |
| 247 | if (*cp != ESC) continue; |
| 248 | if (cp + 3 > cpend) break; /* not space for full SO-designation */ |
| 249 | cpesc = cp; |
| 250 | if (*cp != '$') continue; |
| 251 | if (++cp > cpend) break; |
| 252 | if (*cp != ')') continue; |
| 253 | if (++cp > cpend) break; |
| 254 | newcset = *cp; |
| 255 | if (++cp > cpend) break; |
| 256 | while (cp <= cpend && (*cp == ' ' || *cp == '\t')) ++cp; |
| 257 | if (cp + 3 > cpend) break; /* no space for full SO-designation */ |
| 258 | if ((*cp == ESC && *(cp+1) == '$' && *(cp+2) == ')') |
| 259 | || (newcset == cset)) { |
| 260 | /* skip if a second SO-designation right after or */ |
| 261 | /* this SO-designation is already active, skip */ |
| 262 | if (!stralloc_catb(&tmpdata,cpnext,cpesc-cpnext)) die_nomem(fatal); |
| 263 | --cp; /* "unpeek" so that next iteration will see char */ |
| 264 | cpnext = cpesc + 4; |
| 265 | continue; |
| 266 | } else { |
| 267 | cset = newcset; |
| 268 | continue; |
| 269 | } |
| 270 | } |
| 271 | /* get remainder of line */ |
| 272 | if (!stralloc_catb(&tmpdata,cpnext,cpend - cpnext + 1)) die_nomem(fatal); |
| 273 | if (state != SI) /* need to end in ascii */ |
| 274 | if (!stralloc_cats(&tmpdata,TOSI)) die_nomem(fatal); |
| 275 | else /* ascii end; maybe "-Reply" at the end? */ |
| 276 | r = trimend(tmpdata.s,&(tmpdata.len),fatal); |
| 277 | |
| 278 | } else { /* other character sets = no special treatment */ |
| 279 | r = trimend(cp,&n,fatal); /* -reply */ |
| 280 | if (!stralloc_copyb(&tmpdata,cp,n)) die_nomem(fatal); |
| 281 | } |
| 282 | |
| 283 | cp = tmpdata.s; |
| 284 | n = tmpdata.len; |
| 285 | cpend = cp + n - 1; |
| 286 | if (flagtrimsub) { /* remove leading reply indicators & prefix*/ |
| 287 | r |= trimre(&cp,cpend,prefix,fatal); |
| 288 | n = (unsigned int) (cpend-cp+1); |
| 289 | } |
| 290 | /* there shouldn't be '\0' or '\n', but make sure as */ |
| 291 | /* it would break the message index */ |
| 292 | if (!stralloc_copys(outdata,"")) die_nomem(fatal); |
| 293 | if (!stralloc_ready(outdata,n)) die_nomem(fatal); |
| 294 | outdata->len = n; |
| 295 | cpout = outdata->s; |
| 296 | while (n--) { /* '\n' and '\0' would break the subject index */ |
| 297 | if (!*cp || *cp == '\n') *cpout = ' '; |
| 298 | else *cpout = *cp; |
| 299 | ++cp; ++cpout; |
| 300 | } |
| 301 | return r; |
| 302 | } |
| 303 | |