From cb8be37013a6aefa145183fa3948e3062317203c Mon Sep 17 00:00:00 2001 From: Mark Wooding Date: Fri, 6 Mar 2015 18:12:54 +0000 Subject: [PATCH] dedup-{format,sift}.c: Find ways to save space by making hardlinks. Currently a bit work-in-progress. * dedup-format.c reads fshash files and reformats them a little, most importantly by adding the volume label from the fshash file to each line. * dedup-sift.c reads a sorted stream of dedup-formatted records and (currently) writes a report about which additional hardlinks can be made. The tool is very careful not to corrupt the existing hardlink structure in a single volume, and not to hardlink files which are actually different, even if the fshash records are misleading. Documentation is needed before these are ready for prime time. --- Makefile.am | 11 ++ dedup-format.c | 70 ++++++++++++ dedup-sift.c | 337 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 418 insertions(+) create mode 100644 dedup-format.c create mode 100644 dedup-sift.c diff --git a/Makefile.am b/Makefile.am index 90618b8..31f2cba 100644 --- a/Makefile.am +++ b/Makefile.am @@ -72,6 +72,17 @@ sbin_PROGRAMS += rfreezefs man_MANS += rfreezefs.8 endif +EXTRA_PROGRAMS += dedup-format dedup-sift +dedup_format_SOURCES = dedup-format.c +dedup_format_LDADD = $(mLib_LIBS) +dedup_sift_SOURCES = dedup-sift.c +dedup_sift_LDADD = $(mLib_LIBS) +##EXTRA_DIST += dedup-format.8 dedup-sift.8 +if HAVE_MLIB +sbin_PROGRAMS += dedup-format dedup-sift +##man_MANS += dedup-format.8 dedup-sift.8 +endif + pkgdata_DATA += lib.sh CLEANFILES += lib.sh EXTRA_DIST += lib.sh.in diff --git a/dedup-format.c b/dedup-format.c new file mode 100644 index 0000000..c3d0bc3 --- /dev/null +++ b/dedup-format.c @@ -0,0 +1,70 @@ +#include +#include +#include + +#include +#include +#include +#include + +struct field { + const char *p; + size_t n; +}; + +int main(int argc, char *argv[]) +{ + int i, j; + FILE *fp; + dstr d = DSTR_INIT, dd = DSTR_INIT; + const char *fn, *p, *q; + size_t tagsz; + struct field f[7]; + + ego(argv[0]); + + for (i = 1; i < argc; i++) { + fn = argv[i]; + if ((fp = fopen(fn, "r")) == 0) + die(1, "fopen(%s): %s", fn, strerror(errno)); + q = strrchr(fn, '.'); tagsz = q ? q - fn : strlen(fn); + + for (;;) { + DRESET(&d); DRESET(&dd); + if (dstr_putline(&d, fp) == EOF) break; + + if (!d.len || d.buf[0] == '*' || d.buf[0] == '[') continue; + + for (p = d.buf, j = 0; j < N(f); j++) { + while (*p == ' ') p++; + if (!*p) break; + f[j].p = p; + if (j == N(f) - 1) { + f[j++].n = d.buf + d.len - p; + break; + } + while (*p && *p != ' ') p++; + if (!*p) break; + f[j].n = p - f[j].p; + } + if (j < N(f)) die(1, "failed to parse input"); + + DPUTM(&dd, f[0].p, f[0].n); DPUTC(&dd, ' '); + DPUTM(&dd, f[2].p, f[2].n); DPUTC(&dd, ' '); + DPUTM(&dd, f[3].p, f[3].n); DPUTC(&dd, ' '); + DPUTM(&dd, f[4].p, f[4].n); DPUTC(&dd, ' '); + DPUTM(&dd, f[5].p, f[5].n); DPUTC(&dd, '|'); + DPUTM(&dd, fn, tagsz); DPUTC(&dd, '|'); + DPUTM(&dd, f[6].p, f[6].n); DPUTC(&dd, '\n'); + fwrite(dd.buf, 1, dd.len, stdout); + } + + if (ferror(fp)) die(1, "ferror(%s)", fn); + fclose(fp); + } + + if (ferror(stdout) || fflush(stdout) || fclose(stdout)) + die(1, "stdout: %s", strerror(errno)); + + return (0); +} diff --git a/dedup-sift.c b/dedup-sift.c new file mode 100644 index 0000000..7e80976 --- /dev/null +++ b/dedup-sift.c @@ -0,0 +1,337 @@ +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +struct cand { + size_t name, vol; + struct stat st; + struct cand *nextset, *prevset; + struct cand *next, *rep; +}; + +DA_DECL(cand_v, struct cand); + +/* #define DEBUG */ + +static void picky_link_check(const struct cand *cv, const struct cand *cvl) +{ + const struct cand *p, *q, *r; + + /* Make sure the data structure is set up right. */ + for (p = cv; p < cvl; p++) { + + /* 1. Each candidate should be reachable from its representative. */ + for (q = p->rep; q; q = q->next) + if (q == p) goto found_1; + assert(!"struct-fail 1"); + found_1: + + /* 2. If the candidate is a representative then it should be reachable + * from the root. + */ + if (p == p->rep) { + for (q = cv; q; q = q->nextset) + if (q == p) goto found_2; + assert(!"struct-fail 2"); + found_2:; + } + } + + for (r = 0, p = cv; p; r = p, p = p->nextset) { + + /* 3. Backlinks. */ + assert(p->prevset == r); + + /* 4. All of the candidates in the set should have the same inode number + * and link back to us. + */ + for (q = p; q; q = q->next) { + assert(q->rep == p); + assert(q->st.st_ino == p->st.st_ino); + } + + /* 5. Other sets should have different inode numbers. */ + for (q = cv; q; q = q->nextset) { + if (q == p) continue; + assert(q->st.st_ino != p->st.st_ino); + } + } +} + +#ifdef DEBUG + +static void dump_link_map(const struct cand *cv, const struct cand *cvl) +{ + const struct cand *p; + + for (p = cv; p < cvl; p++) { + printf("%2d: ino=%08lx rep=%ld next=%ld nextset=%ld prevset=%ld\n", + p - cv, p->st.st_ino, p->rep - cv, p->next ? p->next - cv : -1, + p == p->rep && p->nextset ? p->nextset - cv : -1, + p == p->rep && p->prevset ? p->prevset - cv : -1); + } + printf("\n"); + picky_link_check(cv, cvl); +} + +#endif + +#define BUFSZ 16384 + +static void flush_link_candidate_set(struct cand *cv, size_t n, + const char *str) +{ + struct cand *p, *q, *pp, *qq; + const struct cand *cvl = cv + n; + dstr da = DSTR_INIT, db = DSTR_INIT; + int fda = -1, fdb = -1; + ssize_t na, nb; + char bufa[BUFSZ], bufb[BUFSZ]; + + /* If there are no candidates, then back we go. (This happens when we + * read the first line.) + */ + if (!n) return; + + /* Here, we trundle through the candidates and link them into sets sharing + * the same inode number. Candidates within a chain are linked by their + * `next' pointers, and each member (including the representative itself) + * has an `rep' pointer to a set representative, which is the lowest-indexed + * candidate in the set. The set representatives are linked into a + * doubly-linked list by `nextset' and `prevset'. + * + * There are no separate list heads. Instead, we just know that the first + * element is must be a set representative. Handle this element specially + * as a result to initialize the chains. + */ + cv->rep = cv; cv->next = 0; cv->nextset = cv->prevset = 0; + for (p = cv + 1; p < cvl; p++) { + for (q = cv; q < p; q++) { + if (p->st.st_ino == q->st.st_ino) { + p->rep = q; + p->next = q->next; + q->next = p; + goto linked; + } + } + p->rep = p; p->next = 0; + p->nextset = cv->nextset; p->prevset = cv; + if (cv->nextset) cv->nextset->prevset = p; + cv->nextset = p; + linked:; + } +#ifdef DEBUG + dump_link_map(cv, cvl); +#endif + + /* Now see whether we can merge link sets. We can do this if and only if + * they have no volumes in common. + */ + for (p = cv->nextset; p; p = p->nextset) { + for (q = cv; q != p; q = q->nextset) { + for (pp = p; pp; pp = pp->next) { + for (qq = q; qq; qq = qq->next) + if (strcmp(str + pp->vol, str + qq->vol) == 0) goto nextset; + } + + /* We have a potential match. Let's start to get things going. */ + DRESET(&da); dstr_putf(&da, "%s/%s", str + p->vol, str + p->name); + DRESET(&db); dstr_putf(&db, "%s/%s", str + q->vol, str + q->name); + + /* Make sure that our data structure is in a good state. If this is + * wrong then we might have missed a common volume, for example. + */ + picky_link_check(cv, cvl); + + /* Investigate more carefully to make sure this is actually going to + * work. Maybe we didn't record the file status properly. + */ + if (p->st.st_mode != q->st.st_mode || + p->st.st_mtime != q->st.st_mtime || + p->st.st_size != q->st.st_size || + p->st.st_uid != q->st.st_uid || + p->st.st_gid != q->st.st_gid) { + moan("rejecting potential link `%s' <-> `%s': stat mismatch", + da.buf, db.buf); + goto nextset; + } + + /* Check the actual file contents. (This could take a while, but we + * really don't want to screw up.) + */ + if ((fda = open(da.buf, O_RDONLY)) < 0 || + (fdb = open(db.buf, O_RDONLY)) < 0) { + moan("open failed for potential link `%s' <-> `%s': %s", + da.buf, db.buf, strerror(errno)); + goto nextset_close; + } + for (;;) { + na = read(fda, bufa, sizeof(bufa)); + nb = read(fdb, bufb, sizeof(bufb)); + if (na < 0 || nb < 0) { + moan("read failed for potential link `%s' <-> `%s': %s", + da.buf, db.buf, strerror(errno)); + goto nextset_close; + } + if (na != nb) { + moan("rejecting potential link `%s' <-> `%s': length mismatch", + da.buf, db.buf); + goto nextset_close; + } + if (!na) break; + if (memcmp(bufa, bufb, na) != 0) { + moan("rejecting potential link `%s' <-> `%s': content mismatch", + da.buf, db.buf); + goto nextset_close; + } + } + + /* Actually do the work. Well, actually, not yet; just report on it. + */ + printf("save %lu\n", p->st.st_blocks*512); + printf(" -> %s\n", db.buf); + for (pp = p; pp; pp = pp->next) + printf(" <- %s/%s\n", str + pp->vol, str + pp->name); + printf("\n"); + + /* Now actually merge the structures together. */ + for (pp = p;; pp = pp->next) { + pp->st = q->st; + pp->rep = q; + if (!pp->next) { pp->next = q->next; break; } + } + q->next = p; + if (p->nextset) p->nextset->prevset = p->prevset; + if (p->prevset) p->prevset->nextset = p->nextset; + +#ifdef DEBUG + dump_link_map(cv, cvl); +#endif + + nextset_close: + if (fda >= 0) { close(fda); fda = -1; } + if (fdb >= 0) { close(fdb); fdb = -1; } + break; + + nextset:; + } + } + + /* All done. */ + dstr_destroy(&da); + dstr_destroy(&db); +} + +int main(int argc, char *argv[]) +{ + dstr dx = DSTR_INIT, dy = DSTR_INIT, *d = &dx, *dd = &dy, *dt; + dstr dc = DSTR_INIT; + cand_v cv = DA_INIT; + char *n, *t, *k; + size_t ksz, oksz = 0; + int i, x; + size_t len; + + /* Explain who we are. */ + ego(argv[0]); + + /* Work through each input line in turn. */ + for (;;) { + + /* Read the next line. */ + DRESET(d); + if (dstr_putline(d, stdin) == EOF) break; + + /* Split the line into key, tag, and filename fields. */ + k = d->buf; + t = strchr(k, '|'); if (!t) goto parsefail; ksz = t - k; *t++ = 0; + n = strchr(t, '|'); if (!n) goto parsefail; *n++ = 0; + + /* If this key is not the same as the previous one then flush the current + * candidate set and swap buffers (to keep the current key available). + */ + if (ksz != oksz || memcmp(k, dd->buf, ksz) != 0) { + flush_link_candidate_set(DA(&cv), DA_LEN(&cv), dc.buf); + dt = d; d = dd; dd = dt; oksz = ksz; + DA_RESET(&cv); + DRESET(&dc); + } + + /* Commit the volume tag and name to the string buffer, and record the + * offsets in a new candidate record. + * + * This involves decoding the filename, which is a little annoying. + */ + DA_ENSURE(&cv, 1); i = DA_LEN(&cv); + DA(&cv)[i].vol = dc.len; DPUTS(&dc, t); DPUTC(&dc, '/'); + DA(&cv)[i].name = dc.len; + while (*n) { + + /* Not a backslash. Gather up a chunk of not-backslash stuff and + * commit it in one go. + */ + if (*n != '\\') { + len = strcspn(n, "\\"); + DPUTM(&dc, n, len); + n += len; + continue; + } + + /* Decode an escaped thing. Only handle the things which aren't + * actually literals. + */ + n++; + switch (*n) { + case 0: goto parsefail; + case 'n': DPUTC(&dc, '\n'); n++; break; + case 'r': DPUTC(&dc, '\r'); n++; break; + case 't': DPUTC(&dc, '\t'); n++; break; + case 'x': + n++; + if ('0' <= *n && *n <= '9') x = *n - '0'; + else if ('A' <= *n && *n <= 'F') x = *n + 10 - 'A'; + else if ('a' <= *n && *n <= 'f') x = *n + 10 - 'a'; + else goto parsefail; + n++; x <<= 4; + if ('0' <= *n && *n <= '9') x |= *n - '0'; + else if ('A' <= *n && *n <= 'F') x |= *n + 10 - 'A'; + else if ('a' <= *n && *n <= 'f') x |= *n + 10 - 'a'; + else goto parsefail; + n++; + DPUTC(&dc, x); + break; + } + } + DPUTC(&dc, 0); + + /* Fetch the inode number for the file, and then replace the `/' with a + * zero. + */ + n = dc.buf + DA(&cv)[i].vol; + if (stat(n, &DA(&cv)[i].st)) die(1, "stat(%s): %s", n, strerror(errno)); + dc.buf[DA(&cv)[i].name - 1] = 0; + + /* Commit the candidate record and continue. */ + DA_EXTEND(&cv, 1); + } + + /* Flush any remaining candidate set. */ + flush_link_candidate_set(DA(&cv), DA_LEN(&cv), dc.buf); + return (0); + +parsefail: + die(1, "parse failure"); +} -- 2.11.0