From: simon Date: Sun, 2 Nov 2008 13:00:59 +0000 (+0000) Subject: Invent a cunning means of faking plausible atimes for directories, X-Git-Url: https://git.distorted.org.uk/~mdw/sgt/agedu/commitdiff_plain/05b0f827bf0178896f9484fe093fb0d99fa67ba2 Invent a cunning means of faking plausible atimes for directories, since directories will tend to have been accessed constantly by other recursive disk scans and hence not really reflect the semantic last-usage status of the stuff within them. Directories' atimes are now computed as the maximum of their mtime and all the atimes below them. (There's a command-line option to revert to the obvious behaviour, but I think it is not in general the most useful thing.) git-svn-id: svn://svn.tartarus.org/sgt/agedu@8255 cda61777-01e9-0310-a592-d414129be87e --- diff --git a/TODO b/TODO index e742e4b..4858350 100644 --- a/TODO +++ b/TODO @@ -3,9 +3,6 @@ TODO list for agedu Before it's non-embarrassingly releasable: - - arrange to be able to identify directories and leave them on one - side of the usual age display - - cross-Unix portability: + use autoconf * configure use of stat64 @@ -16,6 +13,16 @@ Before it's non-embarrassingly releasable: too, if it's available and O_NOATIME is too. * what do we do elsewhere about _GNU_SOURCE? + - tweak the short options. I think dump files should be a + second-class feature in general, so that --dump and --load should + be represented by capital options. That leaves -d free to be + --depth, which I think is more generally useful. + * while I'm tweaking the options list, "[all modes]" ought to + say "[most modes]", due to --scan-dump. + + - New mode: --remove, to destroy the data file. Handy for + totally self-contained usage: "-s . -w -R". + - man page, --version. Future possibilities: diff --git a/agedu.c b/agedu.c index 68a9799..74d1971 100644 --- a/agedu.c +++ b/agedu.c @@ -66,6 +66,7 @@ struct ctx { struct inclusion_exclusion *inex; int ninex; int crossfs; + int fakeatimes; }; static void dump_line(const char *pathname, const struct trie_file *tf) @@ -102,7 +103,10 @@ static int gotdata(void *vctx, const char *pathname, const struct stat64 *st) return 0; file.size = (unsigned long long)512 * st->st_blocks; - file.atime = st->st_atime; + if (ctx->fakeatimes && S_ISDIR(st->st_mode)) + file.atime = st->st_mtime; + else + file.atime = st->st_atime; /* * Filter based on wildcards. @@ -302,6 +306,10 @@ static void text_query(const void *mappedfile, const char *querydir, HELPARG("wildcard") HELPOPT("[--scan] prune files matching pattern") \ VAL(PRUNEPATH) LONG(prune_path) \ HELPARG("wildcard") HELPOPT("[--scan] prune pathnames matching pattern") \ + NOVAL(DIRATIME) LONG(dir_atime) LONG(dir_atimes) \ + HELPOPT("[--scan] keep real atimes on directories") \ + NOVAL(NODIRATIME) LONG(no_dir_atime) LONG(no_dir_atimes) \ + HELPOPT("[--scan] fake atimes on directories") \ VAL(TQDEPTH) LONG(depth) LONG(max_depth) LONG(maximum_depth) \ HELPARG("levels") HELPOPT("[--text] recurse to this many levels") \ VAL(MINAGE) SHORT(a) LONG(age) LONG(min_age) LONG(minimum_age) \ @@ -462,6 +470,7 @@ int main(int argc, char **argv) int ninex = 0, inexsize = 0; int crossfs = 0; int tqdepth = 1; + int fakediratimes = 1; #ifdef DEBUG_MAD_OPTION_PARSING_MACROS { @@ -692,6 +701,12 @@ int main(int argc, char **argv) case OPT_NOCROSSFS: crossfs = 0; break; + case OPT_DIRATIME: + fakediratimes = 0; + break; + case OPT_NODIRATIME: + fakediratimes = 1; + break; case OPT_DATAFILE: filename = optval; break; @@ -893,6 +908,7 @@ int main(int argc, char **argv) ctx->inex = inex; ctx->ninex = ninex; ctx->crossfs = crossfs; + ctx->fakeatimes = fakediratimes; ctx->last_output_update = time(NULL); @@ -1018,6 +1034,12 @@ int main(int argc, char **argv) return 1; } + if (fakediratimes) { + printf("Faking directory atimes\n"); + trie_fake_dir_atimes(mappedfile); + } + + printf("Building index\n"); ib = indexbuild_new(mappedfile, st.st_size, count); tw = triewalk_new(mappedfile); while ((tf = triewalk_next(tw, NULL)) != NULL) diff --git a/trie.c b/trie.c index 7168a9e..1344bdf 100644 --- a/trie.c +++ b/trie.c @@ -392,6 +392,117 @@ void triebuild_free(triebuild *tb) } /* ---------------------------------------------------------------------- + * Memory-mapped trie modification. + */ + +#define MNODE(t, off, type) \ + ((struct type *)((char *)(t) + (off))) + +static unsigned long long fake_atime_recurse(void *t, struct trie_common *node, + int last_seen_pathsep) +{ + while (node->type == TRIE_STRING) { + struct trie_string *st = (struct trie_string *)node; + last_seen_pathsep = (st->string[st->stringlen-1] == pathsep); + node = MNODE(t, st->subnode, trie_common); + } + + if (node->type == TRIE_LEAF) { + struct trie_leaf *leaf = (struct trie_leaf *)node; + return leaf->file.atime; + } else if (assert(node->type == TRIE_SWITCH), 1) { + struct trie_switch *sw = (struct trie_switch *)node; + const char *chars = (const char *)&sw->sw[sw->len]; + unsigned long long max = 0, subdir, ret; + int i; + int slashindex = -1, bareindex = -1; + + /* + * First, process all the children of this node whose + * switch characters are not \0 or pathsep. We do this in + * reverse order so as to maintain best cache locality + * (tracking generally backwards through the file), though + * it doesn't matter semantically. + * + * For each of these children, we're just recursing into + * it to do any fixups required below it, and amalgamating + * the max atimes we get back. + */ + for (i = sw->len; i-- > 0 ;) { + if (chars[i] == '\0') { + bareindex = i; + } else if (chars[i] == pathsep) { + slashindex = i; + } else { + ret = fake_atime_recurse(t, MNODE(t, sw->sw[i].subnode, + trie_common), 0); + if (max < ret) + max = ret; + } + } + + /* + * Now we have at most two child nodes left to deal with: + * one with a slash (or general pathsep) and one with \0. + * + * If there's a slash node and a bare node, then the slash + * node contains precisely everything inside the directory + * described by the bare node; so we must retrieve the max + * atime for the slash node and use it to fix up the bare + * node. + * + * If there's only a bare node but the pathname leading up + * to this point ends in a slash, then _all_ of the child + * nodes of this node contain stuff inside the directory + * described by the bare node; so we use the whole of the + * maximum value we've computed so far to update the bare + * node. + */ + if (slashindex >= 0) { + ret = fake_atime_recurse(t, MNODE(t, sw->sw[slashindex].subnode, + trie_common), 1); + if (max < ret) + max = ret; + + subdir = ret; + } else if (last_seen_pathsep) { + subdir = max; + } else { + /* Don't update the bare subnode at all. */ + subdir = 0; + } + + if (bareindex >= 0) { + struct trie_leaf *leaf; + + leaf = MNODE(t, sw->sw[bareindex].subnode, trie_leaf); + + if (leaf && leaf->c.type == TRIE_LEAF) { + if (leaf->file.atime < subdir) + leaf->file.atime = subdir; + ret = leaf->file.atime; + } else { + /* Shouldn't really happen, but be cautious anyway */ + ret = fake_atime_recurse(t, &leaf->c, 0); + } + + if (max < ret) + max = ret; + } + + return max; + } +} + +void trie_fake_dir_atimes(void *t) +{ + struct trie_header *hdr = MNODE(t, 0, trie_header); + struct trie_common *node = MNODE(t, hdr->root, trie_common); + + fake_atime_recurse(t, node, 1); +} + +/* ---------------------------------------------------------------------- * Querying functions. */ diff --git a/trie.h b/trie.h index ef52a08..6a6d7ed 100644 --- a/trie.h +++ b/trie.h @@ -50,6 +50,17 @@ int triebuild_finish(triebuild *tb); void triebuild_free(triebuild *tb); /* ---------------------------------------------------------------------- + * Anomalous function which modifies a trie after it's memory-mapped. + */ + +/* + * Invent new fake atimes for each directory in the trie, by + * taking the maximum (latest) of the directory's previously + * stored atime and the atimes of everything below it. + */ +void trie_fake_dir_atimes(void *t); + +/* ---------------------------------------------------------------------- * Functions to query a trie given a pointer to the start of the * memory-mapped file. */