TODO list for agedu
===================
+ - stop trying to calculate an upper bound on the index file size.
+ Instead, just mmap it at initial size + delta, and periodically
+ re-mmap it during index building if it grows too big. If we run
+ out of address space, we'll hear about it eventually; and
+ computing upper bounds given the new optimised index tends to be
+ a factor of five out, which is bad because it'll lead to running
+ out of theoretical address space and erroneously reporting
+ failure long before we run out of it for real.
+
- we could still be using more of the information coming from
autoconf. Our config.h is defining a whole bunch of HAVE_FOOs for
particular functions (e.g. HAVE_INET_NTOA, HAVE_MEMCHR,
controversial; IIRC it's all in POSIX, for one thing. So more
likely this should simply wait until somebody complains.
- - it would be useful to support a choice of indexing strategies.
- The current system's tradeoff of taking O(N log N) space in order
- to be able to support any age cutoff you like is not going to be
- ideal for everybody. A second more conventional mechanism which
- allows the user to specify a number of fixed cutoffs and just
- indexes each directory on those alone would undoubtedly be a
- useful thing for large-scale users. This will require
- considerable thought about how to make the indexers pluggable at
- both index-generation time and query time.
-
- IPv6 support in the HTTP server
* of course, Linux magic auth can still work in this context; we
merely have to be prepared to open one of /proc/net/tcp or
agedu dump file on standard output. Then one would simply feed
that over the network connection of one's choice to the rest
of agedu running on Unix as usual.
+
+ - it might conceivably be useful to support a choice of indexing
+ strategies. The current "continuous index" mechanism's tradeoff of
+ taking O(N log N) space in order to be able to support any age
+ cutoff you like is not going to be ideal for everybody. A second
+ more conventional "discrete index" mechanism which allows the
+ user to specify a number of fixed cutoffs and just indexes each
+ directory on those alone would undoubtedly be a useful thing for
+ large-scale users. This will require considerable thought about
+ how to make the indexers pluggable at both index-generation time
+ and query time.
+ * however, now we have the cut-down version of the continuous
+ index, it might be the case that the space gain is no longer
+ worthwhile.
HELPOPT("[--scan,--load] fake atimes on directories") \
NOVAL(MTIME) LONG(mtime) \
HELPOPT("[--scan] use mtime instead of atime") \
+ NOVAL(FULL) LONG(full_index) \
+ HELPOPT("[--scan] index every file individually") \
VAL(AGERANGE) SHORT(r) LONG(age_range) LONG(range) LONG(ages) \
HELPARG("age[-age]") HELPOPT("[--web,--html] set limits of colour coding") \
VAL(SERVERADDR) LONG(address) LONG(addr) LONG(server_address) \
void *mappedfile;
triewalk *tw;
indexbuild *ib;
- const struct trie_file *tf;
+ const struct trie_file *tf, *prevtf;
char *filename = PNAME ".dat";
int doing_opts = 1;
enum { TEXT, HTML, SCAN, DUMP, SCANDUMP, LOAD, HTTPD, REMOVE };
int tqdepth = 1;
int fakediratimes = 1;
int mtime = 0;
+ int fullindex = 0;
#ifdef DEBUG_MAD_OPTION_PARSING_MACROS
{
case OPT_MTIME:
mtime = 1;
break;
+ case OPT_FULL:
+ fullindex = 1;
+ break;
case OPT_DATAFILE:
filename = optval;
break;
if (mode == SCAN || mode == SCANDUMP || mode == LOAD) {
const char *scandir = actions[action].arg;
+
if (mode == LOAD) {
char *buf = fgetline(stdin);
unsigned newpathsep;
du(scandir, gotdata, ctx);
}
if (mode != SCANDUMP) {
+ size_t maxpathlen;
+ char *buf, *prevbuf;
+
count = triebuild_finish(ctx->tb);
triebuild_free(ctx->tb);
printf("Building index\n");
ib = indexbuild_new(mappedfile, st.st_size, count);
+ maxpathlen = trie_maxpathlen(mappedfile);
+ buf = snewn(maxpathlen, char);
+ prevbuf = snewn(maxpathlen, char);
tw = triewalk_new(mappedfile);
- while ((tf = triewalk_next(tw, NULL)) != NULL)
- indexbuild_add(ib, tf);
+ prevbuf[0] = '\0';
+ tf = triewalk_next(tw, buf);
+ assert(tf);
+ while (1) {
+ int i;
+
+ /*
+ * Get the next file from the index. So we are
+ * currently holding, and have not yet
+ * indexed, prevtf (with pathname prevbuf) and
+ * tf (with pathname buf).
+ */
+ prevtf = tf;
+ memcpy(prevbuf, buf, maxpathlen);
+ tf = triewalk_next(tw, buf);
+
+ if (!tf)
+ buf[0] = '\0';
+
+ /*
+ * Find the first differing character position
+ * between our two pathnames.
+ */
+ for (i = 0; prevbuf[i] && prevbuf[i] == buf[i]; i++);
+
+ /*
+ * If prevbuf was a directory name and buf is
+ * something inside that directory, then
+ * trie_before() will be called on prevbuf
+ * itself. Hence we must drop a tag before it,
+ * so that the resulting index is usable.
+ */
+ if ((!prevbuf[i] && (buf[i] == pathsep ||
+ (i > 0 && buf[i-1] == pathsep))))
+ indexbuild_tag(ib);
+
+ /*
+ * Add prevtf to the index.
+ */
+ indexbuild_add(ib, prevtf);
+
+ if (!tf) {
+ /*
+ * Drop an unconditional final tag, and
+ * get out of this loop.
+ */
+ indexbuild_tag(ib);
+ break;
+ }
+
+ /*
+ * In full-index mode, index everything.
+ */
+ if (fullindex)
+ indexbuild_tag(ib);
+
+ /*
+ * If prevbuf was a filename inside some
+ * directory which buf is outside, then
+ * trie_before() will be called on some
+ * pathname either equal to buf or epsilon
+ * less than it. Either way, we're going to
+ * need to drop a tag after prevtf.
+ */
+ if (strchr(prevbuf+i, pathsep) || !tf)
+ indexbuild_tag(ib);
+ }
+
triewalk_free(tw);
realsize = indexbuild_realsize(ib);
indexbuild_free(ib);
"This is a restricted-access set of pages.");
}
} else {
+ char *q;
p = ctx->url;
p += strspn(p, "/?");
- index = strtoul(p, NULL, 10);
- document = html_query(ctx->t, index, cfg);
- if (document) {
- ret = http_success("text/html", 1, document);
- sfree(document);
- } else {
+ index = strtoul(p, &q, 10);
+ if (*q) {
ret = http_error("404", "Not Found", NULL,
- "Pathname index out of range.");
+ "This is not a valid pathname index.");
+ } else {
+ document = html_query(ctx->t, index, cfg);
+ if (document) {
+ ret = http_success("text/html", 1, document);
+ sfree(document);
+ } else {
+ ret = http_error("404", "Not Found", NULL,
+ "Pathname index out of range.");
+ }
}
}
return ret;
while (fgets(linebuf, sizeof(linebuf), fp)) {
if (strlen(linebuf) >= 75 &&
!strncmp(linebuf+6, matchbuf, strlen(matchbuf))) {
+ fclose(fp);
return atoi(linebuf + 75);
}
}
+ fclose(fp);
}
return -1;
{
off_t node = OFFSET(ib->t, tf);
ib->currroot = avl_insert(ib, ib->currroot, node);
- ib->roots[ib->n++] = OFFSET(ib->t, ib->currroot);
+ ib->roots[ib->n++] = 0;
+}
+
+void indexbuild_tag(indexbuild *ib)
+{
+ if (ib->n > 0)
+ ib->roots[ib->n - 1] = OFFSET(ib->t, ib->currroot);
ib->firstmutable = ib->nodes + ib->nnodes;
}
void indexbuild_free(indexbuild *ib)
{
+ assert(ib->n == trie_count(ib->t));
sfree(ib);
}
if (n > count)
n = count;
+ assert(roots[n-1]);
node = NODE(t, roots[n-1]);
ret = 0;
roots = (const off_t *)((const char *)t + trie_get_index_offset(t));
count = trie_count(t);
+ assert(roots[count-1]);
node = NODE(t, roots[count-1]);
size = node->totalsize * f;
typedef struct indexbuild indexbuild;
indexbuild *indexbuild_new(void *t, off_t startoff, int nodecount);
void indexbuild_add(indexbuild *ib, const struct trie_file *tf);
+void indexbuild_tag(indexbuild *ib);
off_t indexbuild_realsize(indexbuild *ib);
void indexbuild_free(indexbuild *ib);