X-Git-Url: https://git.distorted.org.uk/~mdw/ezmlm/blobdiff_plain/5b62e993b0af39700031c2875d7f6654e6a02850..f8beb284087c279acfb30506f5bb32baa4949b44:/idxthread.c diff --git a/idxthread.c b/idxthread.c new file mode 100644 index 0000000..b8ed2b0 --- /dev/null +++ b/idxthread.c @@ -0,0 +1,682 @@ +/*$Id: idxthread.c,v 1.35 1999/11/22 01:47:45 lindberg Exp $*/ +/*$Name: ezmlm-idx-040 $*/ + +/* idxthread.c contains routines to from the ezmlm-idx subject index build */ +/* a structure of unique subjects as well as a table of messages with */ +/* pointers to the subject. This leads to information on message threads */ +/* arranged chronologically within the thread, and with the threads */ +/* arranged chronologically by the first message within the range. */ +/* idx_mkthreads() will arrange the author list in a similar manner. This */ +/* saves some space, and takes a little extra time. It's needed when */ +/* generating an author index. */ + +#include +#include +#include "error.h" +#include "alloc.h" +#include "str.h" +#include "stralloc.h" +#include "strerr.h" +#include "lock.h" +#include "idx.h" +#include "errtxt.h" +#include "substdio.h" +#include "fmt.h" +#include "readwrite.h" +#include "makehash.h" +#include "yyyymm.h" + +#define DATENO 100 +static stralloc line = {0}; /* primary input */ +static stralloc authline = {0}; /* second line of primary input */ +static stralloc dummyind = {0}; + +static substdio ssindex; +static char indexbuf[1024]; + +static char strnum[FMT_ULONG]; + +struct stat st; + /* if no data, these may be the entire table, so */ + /* need to be static */ +static subentry sdummy; +static authentry adummy; + + +static void die_nomem(fatal) +char *fatal; +{ + strerr_die2x(111,fatal,ERR_NOMEM); +} + +int fdlock; + +/* NOTE: These do NOT prevent double locking */ +static void lockup(fatal) +char *fatal; +{ + fdlock = open_append("lock"); + if (fdlock == -1) + strerr_die2sys(111,fatal,ERR_OPEN_LOCK); + if (lock_ex(fdlock) == -1) { + close(fdlock); + strerr_die2sys(111,fatal,ERR_OBTAIN_LOCK); + } +} + +static void unlock() +{ + close(fdlock); +} + +static void newsub(psubt,subject,sublen,msg,fatal) +/* Initializes subentry pointed to by psubt, adds a '\0' to subject, */ +/* allocates space and copies in subject, and puts a pointer to it in */ +/* the entry. */ +subentry *psubt; +char *subject; +unsigned int sublen; +unsigned long msg; +char *fatal; +{ + register char *cpfrom, *cpto; + register unsigned int cpno; + + psubt->higher = (subentry *) 0; + psubt->lower = (subentry *) 0; + psubt->firstmsg = msg; + psubt->lastmsg = msg; + psubt->msginthread = 1; + if (!(psubt->sub = alloc ((sublen) * sizeof(char)))) + die_nomem(fatal); + cpto = psubt->sub; + cpno = sublen; + cpfrom = subject; + while (cpno--) *(cpto++) = *(cpfrom++); + psubt->sublen = sublen; +} + +static void newauth(pautht,author,authlen,msg,fatal) +/* Allocates space for author of length authlen+1 adding a terminal '\0' */ +/* and puts the pointer in pautht->auth. Analog to newsub(). */ +authentry *pautht; /* entry for current message */ +char *author; /* pointer to author string (not sz!) */ +unsigned int authlen; /* lenth of author */ +unsigned long msg; +char *fatal; /* sz */ + +{ + register char *cpfrom, *cpto; + register unsigned int cpno; + + pautht->higher = (subentry *) 0; + pautht->lower = (subentry *) 0; + pautht->firstmsg = msg; + if (!(pautht->auth = alloc ((authlen) * sizeof(char)))) + die_nomem(fatal); + cpto = pautht->auth; + cpno = authlen; + cpfrom = author; + while (cpno--) *(cpto++) = *(cpfrom++); + pautht->authlen = authlen; +} + +static void init_dummy(fatal) +char *fatal; +{ + unsigned int i; + + if (!stralloc_ready(&dummyind,HASHLEN + 1)) die_nomem(fatal); + for (i = 0; i< HASHLEN; i++) + dummyind.s[i] = 'a'; + dummyind.len = HASHLEN; + if (!stralloc_append(&dummyind," ")) die_nomem(fatal); +} + +void idx_mkthreads(pmsgtable,psubtable,pauthtable,pdatetable, + msg_from,msg_to,msg_latest,locked,fatal) +/* Threads messages msg_from -> msg_to into pmsgtable & psubtable. When */ +/* reading the latest index file (containing msg_latest) it locks the */ +/* directory, unless it is already locked (as in digest creation). */ +/* msgtable has the subject number 1.. (0 if there is no subject match, */ +/* which should happen only if the subject index is corrupt.) */ + +/* 19971107 Changed to deal with index files that are missing, or have */ +/* missing entries, not necessarily reflecting missing archive files. */ +/* This all to make ezmlm-get more robust to get maximal info out of */ +/* corrupted archives. */ + + msgentry **pmsgtable; /* table of message<->subject */ + subentry **psubtable; /* subject no, len, str char * */ + authentry **pauthtable; /* author no, len, str char * */ + dateentry **pdatetable; /* message per date */ + unsigned long msg_from; /* first message in range */ + unsigned long msg_to; /* last message in range */ + unsigned long msg_latest; /* latest message in archive (for locking) */ + int locked; /* if already locked */ + char *fatal; /* Program-specific */ + +{ + unsigned long idxlatest; /* need to lock for this (last) index file */ + unsigned long msg; /* current msg number */ + unsigned long endmsg; /* max msg in this idx file */ + unsigned long tmpmsg; /* index entry's msg number */ + unsigned long idx; /* current index file no */ + unsigned long idxto; /* index containing end of range */ + unsigned long ulmrange; /* total # of messages in range */ + char *subject; /* subject on line */ + unsigned int sublen; /* length of subject */ + char *auth; + unsigned int authlen; + unsigned int pos,posa; + unsigned long submax; /* max subject num in subtable */ + subentry *psubnext; /* points to next entry in subtable */ + subentry *psubt; /* points to entry in subtable */ + authentry *pauthnext; /* points to next entry in authtable */ + authentry *pautht; /* points to entry in authtable */ + int fd; /* index file handle */ + int flagmissingindex; /* current index file is missing */ + int flagauth; /* read index entry has author info */ + int hasauth; /* current msg's entry has author info */ + msgentry *pmsgt; + int res; + int match; + unsigned int datepos,datemax; + unsigned int datetablesize,datetableunit; + unsigned int lastdate = 0; + unsigned int thisdate; + register msgentry *x, *y; + + /* a few unnecessary sanity checks */ + if (msg_to > msg_latest) + msg_to = msg_latest; + if (msg_to < msg_from) + strerr_die2x(100,fatal,"Program error: bad range in idx_mkthreads"); + ulmrange = msg_to - msg_from + 1; + if (!(*pmsgtable = (msgentry *) alloc(ulmrange * sizeof(msgentry)))) + die_nomem(fatal); + y = *pmsgtable; + x = y + ulmrange; /* clear */ + while (--x >= y) { + x->subnum = 0; + x->authnum = 0; + x->date = 0; + } + /* max entries - acceptable waste for now */ + if (!(*psubtable = (subentry *) alloc((ulmrange+1) * sizeof(subentry)))) + die_nomem(fatal); + + if (!(*pauthtable = (authentry *) alloc((ulmrange+1) * sizeof(authentry)))) + die_nomem(fatal); + datetableunit = DATENO * sizeof(dateentry); + datetablesize = datetableunit; + if (!(*pdatetable = (dateentry *) alloc(datetablesize))) + die_nomem(fatal); + datepos = 0; + datemax = DATENO - 2; /* entry 0 and end marker */ + lastdate = 0; + + idxlatest = msg_latest / 100; + idxto = msg_to / 100; + submax = 0; + psubnext = *psubtable; /* dummy node to get tree going. Basically, */ + psubt = &sdummy; /* assure that subject > psubt-sub and that */ + init_dummy(fatal); /* below ok unless HASHLEN > 40 */ + psubt->sub = " "; + psubt->sublen = 40; /* there is something to hold psubt->higher */ + psubt->higher = (subentry *) 0; + psubt->lower = (subentry *) 0; + pauthnext = *pauthtable; + pautht = &adummy; + pautht->auth = psubt->sub; + pautht->authlen = psubt->sublen; + pautht->higher = (authentry *) 0; + pautht->lower = (authentry *) 0; + for (idx = msg_from / 100; idx <= idxto; idx++) { + /* make index file name */ + if (!stralloc_copys(&line,"archive/")) die_nomem(fatal); + if (!stralloc_catb(&line,strnum,fmt_ulong(strnum,idx))) die_nomem(fatal); + if (!stralloc_cats(&line,"/index")) die_nomem(fatal); + if (!stralloc_0(&line)) die_nomem(fatal); + if (!locked && idx == idxlatest) + lockup(fatal); + flagmissingindex = 0; + fd = open_read(line.s); + if (fd == -1) { + if (errno == error_noent) { /* this means the index is not here */ + /* but the lists is supposedly indexed*/ + flagmissingindex = 1; + } else + strerr_die4sys(111,fatal,ERR_OPEN,line.s,": "); + } else + substdio_fdbuf(&ssindex,read,fd,indexbuf,sizeof(indexbuf)); + + msg = 100L * idx; /* current msg# */ + endmsg = msg + 99L; /* max msg in this index */ + if (!msg) msg = 1L; /* for start to make msg > tmpmsg */ + tmpmsg = 0L; /* msg number of read index line */ + if (endmsg > msg_to) /* skip non-asked for subjects */ + endmsg = msg_to; + for (; msg <= endmsg; msg++) { + if (!flagmissingindex && (msg > tmpmsg)) { + flagauth = 0; + if (getln(&ssindex,&line,&match,'\n') == -1) + strerr_die3sys(111,fatal,ERR_READ,"index: "); + if (!match) + flagmissingindex = 1; + else { + pos = scan_ulong(line.s,&tmpmsg); + if (line.s[pos++] == ':') { + if (getln(&ssindex,&authline,&match,'\n') == -1) + strerr_die3sys(111,fatal,ERR_READ,"index: "); + if (!match) + flagmissingindex = 1; + else { + flagauth = 1; + } + pos++; + } + } + } + if (msg < msg_from) /* Nothing before start of range */ + continue; + if (msg == tmpmsg) { + subject = line.s + pos; + sublen = line.len - pos; + if (sublen <= HASHLEN) + strerr_die2x(100,fatal,ERR_BAD_INDEX); + hasauth = flagauth; + } else { + subject = dummyind.s; + sublen = dummyind.len; + hasauth = 0; + } + for(;;) { /* search among already known subjects */ + res = str_diffn(psubt->sub,subject,HASHLEN); + if (res < 0) { + if (psubt->higher) + psubt = psubt->higher; + else { + newsub(psubnext,subject,sublen,msg,fatal); + psubt->higher = psubnext; + psubt = psubnext; + psubnext++; + break; + } + } else if (res > 0) { + if (psubt->lower) + psubt = psubt->lower; + else { + newsub(psubnext,subject,sublen,msg,fatal); + psubt->lower = psubnext; + psubt = psubnext; + psubnext++; + break; + } + } else { + psubt->lastmsg = msg; + (psubt->msginthread)++; /* one more message in thread */ + break; + } + } + /* first subnum =1 (=0 is empty for thread) */ + pmsgt = *pmsgtable + msg - msg_from; + pmsgt->subnum = (unsigned int) (psubt - *psubtable + 1); + pmsgt->date = lastdate; + if (hasauth) { + pos = 0; + while (authline.s[pos] && authline.s[pos] != ' ') pos++; + if (authline.s[++pos]) { + thisdate = date2yyyymm(authline.s + pos); + if (thisdate) pmsgt->date = thisdate; + if (pmsgt->date > lastdate) { + lastdate = pmsgt->date; + if (datepos >= datemax) { /* more space */ + datemax += DATENO; + if (!(*pdatetable = (dateentry *) alloc_re(*pdatetable, + datetablesize,datetablesize+datetableunit))) + die_nomem(fatal); + } + (*pdatetable)[datepos].msg = msg; /* first msg this mo */ + (*pdatetable)[datepos].date = lastdate; + datepos++; + } + posa = byte_chr(authline.s,authline.len,';'); + if (authline.len > posa + HASHLEN + 1 && authline.s[pos+1] != ' ') { + /* old: "; auth", new: ";hash auth" */ + auth = authline.s + posa + 1; + authlen = authline.len - posa - 1; + } else { + auth = dummyind.s; + authlen = dummyind.len; + } + } + /* allright! Same procedure, but for author */ + for (;;) { /* search among already known authors */ + res = str_diffn(pautht->auth,auth,HASHLEN); + if (res < 0) { + if (pautht->higher) + pautht = pautht->higher; + else { + newauth(pauthnext,auth,authlen,msg,fatal); + pautht->higher = pauthnext; + pautht = pauthnext; + pauthnext++; + break; + } + } else if (res > 0) { + if (pautht->lower) + pautht = pautht->lower; + else { + newauth(pauthnext,auth,authlen,msg,fatal); + pautht->lower = pauthnext; + pautht = pauthnext; + pauthnext++; + break; + } + } else { + break; + } + } /* link from message to this author */ + pmsgt->authnum = (unsigned int) (pautht - *pauthtable + 1); + pautht = *pauthtable; + } + + psubt = *psubtable; /* setup psubt. Done here rather than before */ + /* the for loop, so that we can start off */ + /* the dummy node. */ + } + if (fd != -1) + close(fd); + if (!locked && idx == idxlatest) + unlock(); /* 'locked' refers to locked before calling */ + } + psubnext->sub = (char *) 0; /* end of table marker */ + pauthnext->auth = (char *) 0; /* end of table marker */ + (*pdatetable)[datepos].msg = msg_to + 1; + (*pdatetable)[datepos].date = lastdate + 1; +} + + +void idx_mkthread(pmsgtable,psubtable,pauthtable,msg_from,msg_to,msg_master, + msg_latest,locked,fatal) +/* Works like idx_mkthreads, except that it finds the subject for message */ +/* msg_master, then identifies messages in the range that have the same */ +/* subject. msgtable entries with subject 0 do not match, with '1' do match.*/ + +msgentry **pmsgtable; /* pointer to table of message<->subject */ +subentry **psubtable; /* ptr to tbl of subject no, len, str char * */ +authentry **pauthtable; +unsigned long msg_from; /* first message in range */ +unsigned long msg_to; /* last message in range */ +unsigned long msg_latest; /* latest message in archive (for locking) */ +unsigned long msg_master; /* master message for single thread, else 0*/ +int locked; /* if already locked */ +char *fatal; /* Program-specific */ + +{ + unsigned long idxlatest; /* need to lock for this (last) index file */ + unsigned long idxto; /* index for last msg in range */ + unsigned long idx; /* current index file no */ + unsigned long msg; /* index entry's msg number */ + unsigned long ulmrange; /* total # of messages in range */ + subentry *psubt; /* points to last entry in subtable */ + int ffound; /* msg subject was found in subtable */ + int flagauth; /* there is author info */ + int firstfound = 1; /* = 1 until first message in thread found */ + int res; /* comparison result */ + char *auth; + unsigned int authlen; + authentry *pauthnext; /* points to next entry in authtable */ + authentry *pautht; /* points to entry in authtable */ + unsigned int pos; + int fd; /* index file handle */ + int match; + msgentry *pmsgt; + register msgentry *x,*y; + + if ((ulmrange = msg_to - msg_from +1) <= 0) + strerr_die2x(100,fatal,"Program error: bad range in idx_mkthreads"); + if (!(*pmsgtable = (msgentry *) alloc(ulmrange * sizeof(msgentry)))) + die_nomem(fatal); + y = *pmsgtable; + x = y + ulmrange; + while (--x >= y) { + x->subnum = 0; + x->authnum = 0; + x->date = 0; + } + + if (!(*psubtable = (subentry *) alloc(2 * sizeof(subentry)))) + die_nomem(fatal); + + if (!(*pauthtable = (authentry *) alloc((ulmrange + 1) * sizeof(authentry)))) + die_nomem(fatal); + + pauthnext = *pauthtable; + pautht = &adummy; + init_dummy(); + pautht->auth = " "; + pautht->authlen = 21; + pautht->higher = (authentry *) 0; + pautht->lower = (authentry *) 0; + idxlatest = msg_latest / 100; + idxto = msg_to / 100; + idx = msg_master / 100; /* index for master subject */ + + /* Get master subject */ + if (!stralloc_copys(&line,"archive/")) die_nomem(fatal); + if (!stralloc_catb(&line,strnum,fmt_ulong(strnum,idx))) die_nomem(fatal); + if (!stralloc_cats(&line,"/index")) die_nomem(fatal); + if (!stralloc_0(&line)) die_nomem(fatal); + ffound = 0; + if (!locked && idx == idxlatest) + lockup(fatal); + fd = open_read(line.s); + psubt = *psubtable; + if (fd == -1) { + if (errno != error_noent) + strerr_die4sys(111,fatal,ERR_OPEN,line.s,": "); + else + strerr_die2x(111,fatal,ERR_NOINDEX); /* temp - admin can fix! */ + } else { + substdio_fdbuf(&ssindex,read,fd,indexbuf,sizeof(indexbuf)); + for(;;) { + if (getln(&ssindex,&line,&match,'\n') == -1) + strerr_die3sys(111,fatal,ERR_OPEN,"index: "); + if (!match) + break; + pos=scan_ulong(line.s,&msg); + if (line.s[pos++] == ':') { /* marker for author info */ + pos++; + flagauth = 1; + } else + flagauth = 0; + if (msg == msg_master) { + newsub(psubt,line.s+pos,line.len-pos,msg,fatal); + /* need to update msg later! */ + ffound = 1; + break; + } + if (flagauth) { /* skip author line */ + if (getln(&ssindex,&line,&match,'\n') == -1) + strerr_die3sys(111,fatal,ERR_OPEN,"index: "); + if (!match) + break; + } + } + close(fd); + } + if (!locked && idx == idxlatest) + unlock(); + if (!ffound) + strerr_die2x(100,fatal,ERR_NOINDEX); + for (idx = msg_from / 100; idx <= idxto; idx++) { + /* make index file name */ + if (!stralloc_copys(&line,"archive/")) die_nomem(fatal); + if (!stralloc_catb(&line,strnum,fmt_ulong(strnum,idx))) die_nomem(fatal); + if (!stralloc_cats(&line,"/index")) die_nomem(fatal); + if (!stralloc_0(&line)) die_nomem(fatal); + if (!locked && idx == idxlatest) + lockup(fatal); + fd = open_read(line.s); + if (fd == -1) { + if (errno != error_noent) + strerr_die4sys(111,fatal,ERR_OPEN,line.s,": "); + } else { + substdio_fdbuf(&ssindex,read,fd,indexbuf,sizeof(indexbuf)); + for(;;) { + if (getln(&ssindex,&line,&match,'\n') == -1) + strerr_die3sys(111,fatal,ERR_READ,"index: "); + if (!match) + break; + pos=scan_ulong(line.s,&msg); + if (line.s[pos++] == ':') { + pos++; + flagauth = 1; + if (getln(&ssindex,&authline,&match,'\n') == -1) + strerr_die3sys(111,fatal,ERR_READ,"index: "); + if (!match) + break; + } else + flagauth = 0; + if (msg < msg_from) /* Nothing before start of range */ + continue; + if (msg > msg_to) /* Don't do anything after range */ + break; + if (!str_diffn(psubt->sub,line.s+pos,HASHLEN)) { + pmsgt = *pmsgtable + msg - msg_from; + if (firstfound) { /* update to first message with this subj */ + psubt->firstmsg = msg; + firstfound = 0; + } + psubt->lastmsg = msg; + pmsgt->subnum = 1; + if (flagauth) { + if (*authline.s) + pmsgt->date = date2yyyymm(authline.s + 1); + pos = byte_chr(authline.s,authline.len,';'); + if (authline.len > pos + HASHLEN + 1 && authline.s[pos+1] != ' ') { + /* old: "; auth", new: ";hash auth" */ + auth = authline.s + pos + 1; + authlen = authline.len - pos - 1; + } else { + auth = dummyind.s; + authlen = dummyind.len; + } + for (;;) { /* search among already known authors */ + res = str_diffn(pautht->auth,auth,HASHLEN); + if (res < 0) { + if (pautht->higher) + pautht = pautht->higher; + else { + newauth(pauthnext,auth,authlen,msg,fatal); + pautht->higher = pauthnext; + pautht = pauthnext; + pauthnext++; + break; + } + } else if (res > 0) { + if (pautht->lower) + pautht = pautht->lower; + else { + newauth(pauthnext,auth,authlen,msg,fatal); + pautht->lower = pauthnext; + pautht = pauthnext; + pauthnext++; + break; + } + } else { + break; + } + } /* link from message to this author */ + pmsgt->authnum = (unsigned int) (pautht - *pauthtable + 1); + pautht = *pauthtable; + } + + } + } + close(fd); + } + if (!locked && idx == idxlatest) + unlock(); + } + ++psubt; + psubt->sub = (char *) 0; /* end of table marker */ + pauthnext->auth = (char *) 0; /* end of table marker */ +} + +void idx_mklist(pmsgtable,psubtable,pauthtable,msg_from,msg_to,fatal) +/* Like mkthreads, except that it works without a subject index. The result */ +/* is just a dummy subject and a sequential list of messages. This to allow */ +/* use of the same routines when creating digest from lists that have no */ +/* subject index (for whatever reason). */ +msgentry **pmsgtable; /* pointer to table of message<->subject */ +subentry **psubtable; /* ptr to tbl of subject no, len, str char * */ +authentry **pauthtable; +unsigned long msg_from; /* first message in range */ +unsigned long msg_to; /* last message in range */ +char *fatal; /* Program-specific */ +{ + unsigned long ulmrange; + register msgentry *x,*y; + subentry *psubt; + authentry *pautht; + + if ((ulmrange = msg_to - msg_from +1) <= 0) + strerr_die2x(111,fatal,"bad range in idx_mkthreads :"); + + if (!(*pmsgtable = (msgentry *) alloc(ulmrange * sizeof(msgentry)))) + die_nomem(fatal); + + y = *pmsgtable; + x = y + ulmrange; + while (--x >= y) { + x->subnum = 1; + x->authnum = 0; + x->date = 0; + } + + if (!(*psubtable = (subentry *) alloc(2 * sizeof(subentry)))) + die_nomem(fatal); + psubt = *psubtable; + newsub(psubt,dummyind.s,dummyind.len,msg_from,fatal); + psubt->lastmsg = msg_to; + ++psubt; + psubt->sub = (char *) 0; + if (!(*pauthtable = (authentry *) alloc(sizeof(authentry)))) + die_nomem(fatal); /* nodata. Avoid dangling ptr. */ + pautht = *pauthtable; + pautht->auth = 0; /* tells app that there are no author data */ + pautht->higher = (authentry *) 0; + pautht->lower = (authentry *) 0; +} + +void idx_destroythread(msgtable,subtable,authtable) +/* Frees space allocated by idxthread routines. This is needed only if */ +/* one does several threadings in one program run. Otherwise, exit() */ +/* should free all allocated memory, which will be faster. */ +msgentry *msgtable; subentry *subtable; authentry *authtable; +{ + subentry *psubt; + authentry *pautht; + + psubt = subtable; /* free subjects */ + while(psubt->sub) { + alloc_free(psubt->sub); + psubt++; + } + + pautht = authtable; /* free authors */ + while(pautht->auth) { + alloc_free(pautht->auth); + pautht++; + } + + alloc_free(subtable); /* free subtable */ + alloc_free(authtable); /* free authtable */ + alloc_free(msgtable); /* free msgtable */ + subtable = (subentry *) 0; /* kill pointers */ + authtable = (authentry *) 0; + msgtable = (msgentry *) 0; +}