TODO list for agedu
===================
- - stop trying to calculate an upper bound on the index file size.
- Instead, just mmap it at initial size + delta, and periodically
- re-mmap it during index building if it grows too big. If we run
- out of address space, we'll hear about it eventually; and
- computing upper bounds given the new optimised index tends to be
- a factor of five out, which is bad because it'll lead to running
- out of theoretical address space and erroneously reporting
- failure long before we run out of it for real.
-
- we could still be using more of the information coming from
autoconf. Our config.h is defining a whole bunch of HAVE_FOOs for
particular functions (e.g. HAVE_INET_NTOA, HAVE_MEMCHR,
how to make the indexers pluggable at both index-generation time
and query time.
* however, now we have the cut-down version of the continuous
- index, it might be the case that the space gain is no longer
- worthwhile.
+ index, the space saving is less compelling.
requires \cw{O(N log N)} storage. This is larger than you might
expect; a scan of my own home directory, containing half a million
files and directories and about 20Gb of data, produced an index file
-nearly a third of a Gb in size. Furthermore, since the data file
-must be memory-mapped during most processing, it can never grow
-larger than available address space, which means that any use of
-\cw{agedu} on a file system more than about ten times the above size
-is probably going to have to be done on a 64-bit computer.
+over 60Mb in size. Furthermore, since the data file must be
+memory-mapped during most processing, it can never grow larger than
+available address space, so a \em{really} big filesystem may need to
+be indexed on a 64-bit computer. (This is one reason for the
+existence of the \cw{-D} and \cw{-L} options: you can do the
+scanning on the machine with access to the filesystem, and the
+indexing on a machine big enough to handle it.)
The data structure also does not usefully permit access control
within the data file, so it would be difficult \dash even given the
HELPOPT("[--scan,--load] fake atimes on directories") \
NOVAL(MTIME) LONG(mtime) \
HELPOPT("[--scan] use mtime instead of atime") \
- NOVAL(FULL) LONG(full_index) \
- HELPOPT("[--scan] index every file individually") \
VAL(AGERANGE) SHORT(r) LONG(age_range) LONG(range) LONG(ages) \
HELPARG("age[-age]") HELPOPT("[--web,--html] set limits of colour coding") \
VAL(SERVERADDR) LONG(address) LONG(addr) LONG(server_address) \
int tqdepth = 1;
int fakediratimes = 1;
int mtime = 0;
- int fullindex = 0;
#ifdef DEBUG_MAD_OPTION_PARSING_MACROS
{
case OPT_MTIME:
mtime = 1;
break;
- case OPT_FULL:
- fullindex = 1;
- break;
case OPT_DATAFILE:
filename = optval;
break;
}
if (mode != SCANDUMP) {
size_t maxpathlen;
+ size_t delta;
char *buf, *prevbuf;
count = triebuild_finish(ctx->tb);
return 1;
}
- printf("Built pathname index, %d entries, %llu bytes\n", count,
+ printf("Built pathname index, %d entries,"
+ " %llu bytes of index\n", count,
(unsigned long long)st.st_size);
- totalsize = index_compute_size(st.st_size, count);
+ totalsize = index_initial_size(st.st_size, count);
+ totalsize += totalsize / 10;
if (lseek(fd, totalsize-1, SEEK_SET) < 0) {
perror(PNAME ": lseek");
return 1;
}
- printf("Upper bound on index file size = %llu bytes\n",
- (unsigned long long)totalsize);
-
mappedfile = mmap(NULL, totalsize, PROT_READ|PROT_WRITE,MAP_SHARED, fd, 0);
if (!mappedfile) {
perror(PNAME ": mmap");
}
printf("Building index\n");
- ib = indexbuild_new(mappedfile, st.st_size, count);
+ ib = indexbuild_new(mappedfile, st.st_size, count, &delta);
maxpathlen = trie_maxpathlen(mappedfile);
buf = snewn(maxpathlen, char);
prevbuf = snewn(maxpathlen, char);
while (1) {
int i;
+ if (totalsize - indexbuild_realsize(ib) < delta) {
+ /*
+ * Unmap the file, grow it, and remap it.
+ */
+ munmap(mappedfile, totalsize);
+
+ totalsize += delta;
+ totalsize += totalsize / 10;
+
+ if (lseek(fd, totalsize-1, SEEK_SET) < 0) {
+ perror(PNAME ": lseek");
+ return 1;
+ }
+ if (write(fd, "\0", 1) < 1) {
+ perror(PNAME ": write");
+ return 1;
+ }
+
+ mappedfile = mmap(NULL, totalsize, PROT_READ|PROT_WRITE,MAP_SHARED, fd, 0);
+ if (!mappedfile) {
+ perror(PNAME ": mmap");
+ return 1;
+ }
+
+ indexbuild_rebase(ib, mappedfile);
+ triewalk_rebase(tw, mappedfile);
+ }
+
/*
* Get the next file from the index. So we are
* currently holding, and have not yet
indexbuild_tag(ib);
break;
}
-
- /*
- * In full-index mode, index everything.
- */
- if (fullindex)
- indexbuild_tag(ib);
/*
* If prevbuf was a filename inside some
munmap(mappedfile, totalsize);
ftruncate(fd, realsize);
close(fd);
- printf("Actual index file size = %llu bytes\n",
+ printf("Final index file size = %llu bytes\n",
(unsigned long long)realsize);
}
} else if (mode == TEXT) {
unsigned long long totalsize;
};
-static int index_navlnodes(int nodecount)
+/*
+ * Determine the maximum depth of an AVL tree containing a certain
+ * number of nodes.
+ */
+static int index_maxdepth(int nodecount)
{
- int b, c, maxdepth, ret;
+ int b, c, maxdepth;
/*
* Model the tree growing at maximum imbalance. We do this by
* (i.e. smallest) tree of any given depth, and stopping when
* that's larger than nodecount.
*/
- ret = 0;
maxdepth = 1;
b = 0;
c = 1;
while (b <= nodecount) {
int tmp;
- /*
- * A tree with at most b nodes can be at most depth
- * maxdepth-1. A tree with more than b but at most c can
- * be at most maxdepth. Therefore, for each tree size
- * between b and c, we might need to adjust maxdepth+1
- * nodes.
- */
- tmp = min(c, nodecount);
- ret += (tmp - b) * maxdepth;
-
tmp = 1 + b + c;
b = c;
c = tmp;
maxdepth++;
}
- return ret;
+ return maxdepth;
}
-off_t index_compute_size(off_t currentsize, int nodecount)
+off_t index_initial_size(off_t currentsize, int nodecount)
{
currentsize += PADDING(currentsize, alignof(off_t));
currentsize += nodecount + sizeof(off_t);
currentsize += PADDING(currentsize, alignof(struct avlnode));
- currentsize += index_navlnodes(nodecount) * sizeof(struct avlnode);
return currentsize;
}
struct indexbuild {
void *t;
- int n, nnodes, maxnodes;
+ int n, nnodes;
struct avlnode *nodes;
off_t *roots;
struct avlnode *currroot;
((node) ? (off_t)((const char *)node - (const char *)t) : 0)
#define MAXDEPTH(node) ((node) ? (node)->maxdepth : 0)
-indexbuild *indexbuild_new(void *t, off_t startoff, int nodecount)
+indexbuild *indexbuild_new(void *t, off_t startoff, int nodecount,
+ size_t *delta)
{
indexbuild *ib = snew(indexbuild);
startoff += nodecount * sizeof(off_t);
startoff += PADDING(startoff, alignof(struct avlnode));
ib->nodes = (struct avlnode *)((char *)t + startoff);
- ib->maxnodes = index_navlnodes(nodecount);
ib->nnodes = ib->n = 0;
ib->currroot = NULL;
ib->firstmutable = ib->nodes + ib->nnodes;
+ if (delta)
+ *delta = sizeof(struct avlnode) * (1 + index_maxdepth(nodecount));
+
return ib;
}
if (n && n >= ib->firstmutable)
return n; /* already mutable */
- assert(ib->nnodes < ib->maxnodes);
newnode = ib->nodes + ib->nnodes++;
if (n)
*newnode = *n; /* structure copy */
ib->firstmutable = ib->nodes + ib->nnodes;
}
+void indexbuild_rebase(indexbuild *ib, void *t)
+{
+ ptrdiff_t diff = (unsigned char *)t - (unsigned char *)(ib->t);
+
+ ib->t = t;
+ ib->nodes = (struct avlnode *)((unsigned char *)ib->nodes + diff);
+ ib->roots = (off_t *)((unsigned char *)ib->roots + diff);
+ ib->currroot = (struct avlnode *)((unsigned char *)ib->currroot + diff);
+ ib->firstmutable = (struct avlnode *)((unsigned char *)ib->firstmutable + diff);
+}
+
off_t indexbuild_realsize(indexbuild *ib)
{
return OFFSET(ib->t, (ib->nodes + ib->nnodes));
/*
* Given the size of an existing data file and the number of
- * entries required in the index, calculate and return an upper
- * bound on the required size of the index file after an index has
- * been written on to the end of it.
+ * entries required in the index, return the file size required to
+ * store the fixed initial fragment of the index before the
+ * indexbuild functions are invoked.
*/
-off_t index_compute_size(off_t currentsize, int nodecount);
+off_t index_initial_size(off_t currentsize, int nodecount);
/*
* Build an index, given the address of a memory-mapped data file
* trie_file structures passed to tf must of course be within the
* bounds of the memory-mapped file.
*
+ * The "delta" parameter to indexbuild_new is filled in with the
+ * maximum size by which the index can ever grow in a single
+ * indexbuild_add call. This can be used, together with
+ * indexbuild_realsize, to detect when the index is about to get
+ * too big for its file and the file needs resizing.
+ *
+ * indexbuild_add adds a trie_file structure to the index.
+ * indexbuild_tag, called after that, causes the current state of
+ * the index to be preserved so that it can be queried later. It's
+ * idempotent, and will safely do nothing if called before
+ * indexbuild_add.
+ *
* indexbuild_realsize returns the total amount of data _actually_
- * written into the file, to allow it to be truncated afterwards.
+ * written into the file, to allow it to be truncated afterwards,
+ * and to allow the caller of the indexbuild functions to know
+ * when the file is about to need to grow bigger during index
+ * building.
+ *
+ * indexbuild_rebase is used when the file has been
+ * re-memory-mapped at a different address (because it needs to
+ * grow).
*/
typedef struct indexbuild indexbuild;
-indexbuild *indexbuild_new(void *t, off_t startoff, int nodecount);
+indexbuild *indexbuild_new(void *t, off_t startoff, int nodecount,
+ size_t *delta);
void indexbuild_add(indexbuild *ib, const struct trie_file *tf);
void indexbuild_tag(indexbuild *ib);
+void indexbuild_rebase(indexbuild *ib, void *t);
off_t indexbuild_realsize(indexbuild *ib);
void indexbuild_free(indexbuild *ib);
return tw;
}
+
+void triewalk_rebase(triewalk *tw, const void *t)
+{
+ ptrdiff_t diff = ((const unsigned char *)t - (const unsigned char *)(tw->t));
+ int i;
+
+ tw->t = t;
+
+ for (i = 0; i < tw->nswitches; i++)
+ tw->switches[i].sw = (const struct trie_switch *)
+ ((const unsigned char *)(tw->switches[i].sw) + diff);
+}
+
const struct trie_file *triewalk_next(triewalk *tw, char *buf)
{
off_t off;
typedef struct triewalk triewalk;
triewalk *triewalk_new(const void *t);
const struct trie_file *triewalk_next(triewalk *tw, char *buf);
+void triewalk_rebase(triewalk *tw, const void *t);
void triewalk_free(triewalk *tw);
/* ----------------------------------------------------------------------