X-Git-Url: https://git.distorted.org.uk/~mdw/rsync-backup/blobdiff_plain/e0de3610917976a1cade67f4c4a9975144bdf628..cb8be37013a6aefa145183fa3948e3062317203c:/dedup-sift.c?ds=sidebyside diff --git a/dedup-sift.c b/dedup-sift.c new file mode 100644 index 0000000..7e80976 --- /dev/null +++ b/dedup-sift.c @@ -0,0 +1,337 @@ +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +struct cand { + size_t name, vol; + struct stat st; + struct cand *nextset, *prevset; + struct cand *next, *rep; +}; + +DA_DECL(cand_v, struct cand); + +/* #define DEBUG */ + +static void picky_link_check(const struct cand *cv, const struct cand *cvl) +{ + const struct cand *p, *q, *r; + + /* Make sure the data structure is set up right. */ + for (p = cv; p < cvl; p++) { + + /* 1. Each candidate should be reachable from its representative. */ + for (q = p->rep; q; q = q->next) + if (q == p) goto found_1; + assert(!"struct-fail 1"); + found_1: + + /* 2. If the candidate is a representative then it should be reachable + * from the root. + */ + if (p == p->rep) { + for (q = cv; q; q = q->nextset) + if (q == p) goto found_2; + assert(!"struct-fail 2"); + found_2:; + } + } + + for (r = 0, p = cv; p; r = p, p = p->nextset) { + + /* 3. Backlinks. */ + assert(p->prevset == r); + + /* 4. All of the candidates in the set should have the same inode number + * and link back to us. + */ + for (q = p; q; q = q->next) { + assert(q->rep == p); + assert(q->st.st_ino == p->st.st_ino); + } + + /* 5. Other sets should have different inode numbers. */ + for (q = cv; q; q = q->nextset) { + if (q == p) continue; + assert(q->st.st_ino != p->st.st_ino); + } + } +} + +#ifdef DEBUG + +static void dump_link_map(const struct cand *cv, const struct cand *cvl) +{ + const struct cand *p; + + for (p = cv; p < cvl; p++) { + printf("%2d: ino=%08lx rep=%ld next=%ld nextset=%ld prevset=%ld\n", + p - cv, p->st.st_ino, p->rep - cv, p->next ? p->next - cv : -1, + p == p->rep && p->nextset ? p->nextset - cv : -1, + p == p->rep && p->prevset ? p->prevset - cv : -1); + } + printf("\n"); + picky_link_check(cv, cvl); +} + +#endif + +#define BUFSZ 16384 + +static void flush_link_candidate_set(struct cand *cv, size_t n, + const char *str) +{ + struct cand *p, *q, *pp, *qq; + const struct cand *cvl = cv + n; + dstr da = DSTR_INIT, db = DSTR_INIT; + int fda = -1, fdb = -1; + ssize_t na, nb; + char bufa[BUFSZ], bufb[BUFSZ]; + + /* If there are no candidates, then back we go. (This happens when we + * read the first line.) + */ + if (!n) return; + + /* Here, we trundle through the candidates and link them into sets sharing + * the same inode number. Candidates within a chain are linked by their + * `next' pointers, and each member (including the representative itself) + * has an `rep' pointer to a set representative, which is the lowest-indexed + * candidate in the set. The set representatives are linked into a + * doubly-linked list by `nextset' and `prevset'. + * + * There are no separate list heads. Instead, we just know that the first + * element is must be a set representative. Handle this element specially + * as a result to initialize the chains. + */ + cv->rep = cv; cv->next = 0; cv->nextset = cv->prevset = 0; + for (p = cv + 1; p < cvl; p++) { + for (q = cv; q < p; q++) { + if (p->st.st_ino == q->st.st_ino) { + p->rep = q; + p->next = q->next; + q->next = p; + goto linked; + } + } + p->rep = p; p->next = 0; + p->nextset = cv->nextset; p->prevset = cv; + if (cv->nextset) cv->nextset->prevset = p; + cv->nextset = p; + linked:; + } +#ifdef DEBUG + dump_link_map(cv, cvl); +#endif + + /* Now see whether we can merge link sets. We can do this if and only if + * they have no volumes in common. + */ + for (p = cv->nextset; p; p = p->nextset) { + for (q = cv; q != p; q = q->nextset) { + for (pp = p; pp; pp = pp->next) { + for (qq = q; qq; qq = qq->next) + if (strcmp(str + pp->vol, str + qq->vol) == 0) goto nextset; + } + + /* We have a potential match. Let's start to get things going. */ + DRESET(&da); dstr_putf(&da, "%s/%s", str + p->vol, str + p->name); + DRESET(&db); dstr_putf(&db, "%s/%s", str + q->vol, str + q->name); + + /* Make sure that our data structure is in a good state. If this is + * wrong then we might have missed a common volume, for example. + */ + picky_link_check(cv, cvl); + + /* Investigate more carefully to make sure this is actually going to + * work. Maybe we didn't record the file status properly. + */ + if (p->st.st_mode != q->st.st_mode || + p->st.st_mtime != q->st.st_mtime || + p->st.st_size != q->st.st_size || + p->st.st_uid != q->st.st_uid || + p->st.st_gid != q->st.st_gid) { + moan("rejecting potential link `%s' <-> `%s': stat mismatch", + da.buf, db.buf); + goto nextset; + } + + /* Check the actual file contents. (This could take a while, but we + * really don't want to screw up.) + */ + if ((fda = open(da.buf, O_RDONLY)) < 0 || + (fdb = open(db.buf, O_RDONLY)) < 0) { + moan("open failed for potential link `%s' <-> `%s': %s", + da.buf, db.buf, strerror(errno)); + goto nextset_close; + } + for (;;) { + na = read(fda, bufa, sizeof(bufa)); + nb = read(fdb, bufb, sizeof(bufb)); + if (na < 0 || nb < 0) { + moan("read failed for potential link `%s' <-> `%s': %s", + da.buf, db.buf, strerror(errno)); + goto nextset_close; + } + if (na != nb) { + moan("rejecting potential link `%s' <-> `%s': length mismatch", + da.buf, db.buf); + goto nextset_close; + } + if (!na) break; + if (memcmp(bufa, bufb, na) != 0) { + moan("rejecting potential link `%s' <-> `%s': content mismatch", + da.buf, db.buf); + goto nextset_close; + } + } + + /* Actually do the work. Well, actually, not yet; just report on it. + */ + printf("save %lu\n", p->st.st_blocks*512); + printf(" -> %s\n", db.buf); + for (pp = p; pp; pp = pp->next) + printf(" <- %s/%s\n", str + pp->vol, str + pp->name); + printf("\n"); + + /* Now actually merge the structures together. */ + for (pp = p;; pp = pp->next) { + pp->st = q->st; + pp->rep = q; + if (!pp->next) { pp->next = q->next; break; } + } + q->next = p; + if (p->nextset) p->nextset->prevset = p->prevset; + if (p->prevset) p->prevset->nextset = p->nextset; + +#ifdef DEBUG + dump_link_map(cv, cvl); +#endif + + nextset_close: + if (fda >= 0) { close(fda); fda = -1; } + if (fdb >= 0) { close(fdb); fdb = -1; } + break; + + nextset:; + } + } + + /* All done. */ + dstr_destroy(&da); + dstr_destroy(&db); +} + +int main(int argc, char *argv[]) +{ + dstr dx = DSTR_INIT, dy = DSTR_INIT, *d = &dx, *dd = &dy, *dt; + dstr dc = DSTR_INIT; + cand_v cv = DA_INIT; + char *n, *t, *k; + size_t ksz, oksz = 0; + int i, x; + size_t len; + + /* Explain who we are. */ + ego(argv[0]); + + /* Work through each input line in turn. */ + for (;;) { + + /* Read the next line. */ + DRESET(d); + if (dstr_putline(d, stdin) == EOF) break; + + /* Split the line into key, tag, and filename fields. */ + k = d->buf; + t = strchr(k, '|'); if (!t) goto parsefail; ksz = t - k; *t++ = 0; + n = strchr(t, '|'); if (!n) goto parsefail; *n++ = 0; + + /* If this key is not the same as the previous one then flush the current + * candidate set and swap buffers (to keep the current key available). + */ + if (ksz != oksz || memcmp(k, dd->buf, ksz) != 0) { + flush_link_candidate_set(DA(&cv), DA_LEN(&cv), dc.buf); + dt = d; d = dd; dd = dt; oksz = ksz; + DA_RESET(&cv); + DRESET(&dc); + } + + /* Commit the volume tag and name to the string buffer, and record the + * offsets in a new candidate record. + * + * This involves decoding the filename, which is a little annoying. + */ + DA_ENSURE(&cv, 1); i = DA_LEN(&cv); + DA(&cv)[i].vol = dc.len; DPUTS(&dc, t); DPUTC(&dc, '/'); + DA(&cv)[i].name = dc.len; + while (*n) { + + /* Not a backslash. Gather up a chunk of not-backslash stuff and + * commit it in one go. + */ + if (*n != '\\') { + len = strcspn(n, "\\"); + DPUTM(&dc, n, len); + n += len; + continue; + } + + /* Decode an escaped thing. Only handle the things which aren't + * actually literals. + */ + n++; + switch (*n) { + case 0: goto parsefail; + case 'n': DPUTC(&dc, '\n'); n++; break; + case 'r': DPUTC(&dc, '\r'); n++; break; + case 't': DPUTC(&dc, '\t'); n++; break; + case 'x': + n++; + if ('0' <= *n && *n <= '9') x = *n - '0'; + else if ('A' <= *n && *n <= 'F') x = *n + 10 - 'A'; + else if ('a' <= *n && *n <= 'f') x = *n + 10 - 'a'; + else goto parsefail; + n++; x <<= 4; + if ('0' <= *n && *n <= '9') x |= *n - '0'; + else if ('A' <= *n && *n <= 'F') x |= *n + 10 - 'A'; + else if ('a' <= *n && *n <= 'f') x |= *n + 10 - 'a'; + else goto parsefail; + n++; + DPUTC(&dc, x); + break; + } + } + DPUTC(&dc, 0); + + /* Fetch the inode number for the file, and then replace the `/' with a + * zero. + */ + n = dc.buf + DA(&cv)[i].vol; + if (stat(n, &DA(&cv)[i].st)) die(1, "stat(%s): %s", n, strerror(errno)); + dc.buf[DA(&cv)[i].name - 1] = 0; + + /* Commit the candidate record and continue. */ + DA_EXTEND(&cv, 1); + } + + /* Flush any remaining candidate set. */ + flush_link_candidate_set(DA(&cv), DA_LEN(&cv), dc.buf); + return (0); + +parsefail: + die(1, "parse failure"); +}