--- /dev/null
+#include <assert.h>
+#include <errno.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <unistd.h>
+#include <fcntl.h>
+
+#include <mLib/dstr.h>
+#include <mLib/darray.h>
+#include <mLib/macros.h>
+#include <mLib/quis.h>
+#include <mLib/report.h>
+
+struct cand {
+ size_t name, vol;
+ struct stat st;
+ struct cand *nextset, *prevset;
+ struct cand *next, *rep;
+};
+
+DA_DECL(cand_v, struct cand);
+
+/* #define DEBUG */
+
+static void picky_link_check(const struct cand *cv, const struct cand *cvl)
+{
+ const struct cand *p, *q, *r;
+
+ /* Make sure the data structure is set up right. */
+ for (p = cv; p < cvl; p++) {
+
+ /* 1. Each candidate should be reachable from its representative. */
+ for (q = p->rep; q; q = q->next)
+ if (q == p) goto found_1;
+ assert(!"struct-fail 1");
+ found_1:
+
+ /* 2. If the candidate is a representative then it should be reachable
+ * from the root.
+ */
+ if (p == p->rep) {
+ for (q = cv; q; q = q->nextset)
+ if (q == p) goto found_2;
+ assert(!"struct-fail 2");
+ found_2:;
+ }
+ }
+
+ for (r = 0, p = cv; p; r = p, p = p->nextset) {
+
+ /* 3. Backlinks. */
+ assert(p->prevset == r);
+
+ /* 4. All of the candidates in the set should have the same inode number
+ * and link back to us.
+ */
+ for (q = p; q; q = q->next) {
+ assert(q->rep == p);
+ assert(q->st.st_ino == p->st.st_ino);
+ }
+
+ /* 5. Other sets should have different inode numbers. */
+ for (q = cv; q; q = q->nextset) {
+ if (q == p) continue;
+ assert(q->st.st_ino != p->st.st_ino);
+ }
+ }
+}
+
+#ifdef DEBUG
+
+static void dump_link_map(const struct cand *cv, const struct cand *cvl)
+{
+ const struct cand *p;
+
+ for (p = cv; p < cvl; p++) {
+ printf("%2d: ino=%08lx rep=%ld next=%ld nextset=%ld prevset=%ld\n",
+ p - cv, p->st.st_ino, p->rep - cv, p->next ? p->next - cv : -1,
+ p == p->rep && p->nextset ? p->nextset - cv : -1,
+ p == p->rep && p->prevset ? p->prevset - cv : -1);
+ }
+ printf("\n");
+ picky_link_check(cv, cvl);
+}
+
+#endif
+
+#define BUFSZ 16384
+
+static void flush_link_candidate_set(struct cand *cv, size_t n,
+ const char *str)
+{
+ struct cand *p, *q, *pp, *qq;
+ const struct cand *cvl = cv + n;
+ dstr da = DSTR_INIT, db = DSTR_INIT;
+ int fda = -1, fdb = -1;
+ ssize_t na, nb;
+ char bufa[BUFSZ], bufb[BUFSZ];
+
+ /* If there are no candidates, then back we go. (This happens when we
+ * read the first line.)
+ */
+ if (!n) return;
+
+ /* Here, we trundle through the candidates and link them into sets sharing
+ * the same inode number. Candidates within a chain are linked by their
+ * `next' pointers, and each member (including the representative itself)
+ * has an `rep' pointer to a set representative, which is the lowest-indexed
+ * candidate in the set. The set representatives are linked into a
+ * doubly-linked list by `nextset' and `prevset'.
+ *
+ * There are no separate list heads. Instead, we just know that the first
+ * element is must be a set representative. Handle this element specially
+ * as a result to initialize the chains.
+ */
+ cv->rep = cv; cv->next = 0; cv->nextset = cv->prevset = 0;
+ for (p = cv + 1; p < cvl; p++) {
+ for (q = cv; q < p; q++) {
+ if (p->st.st_ino == q->st.st_ino) {
+ p->rep = q;
+ p->next = q->next;
+ q->next = p;
+ goto linked;
+ }
+ }
+ p->rep = p; p->next = 0;
+ p->nextset = cv->nextset; p->prevset = cv;
+ if (cv->nextset) cv->nextset->prevset = p;
+ cv->nextset = p;
+ linked:;
+ }
+#ifdef DEBUG
+ dump_link_map(cv, cvl);
+#endif
+
+ /* Now see whether we can merge link sets. We can do this if and only if
+ * they have no volumes in common.
+ */
+ for (p = cv->nextset; p; p = p->nextset) {
+ for (q = cv; q != p; q = q->nextset) {
+ for (pp = p; pp; pp = pp->next) {
+ for (qq = q; qq; qq = qq->next)
+ if (strcmp(str + pp->vol, str + qq->vol) == 0) goto nextset;
+ }
+
+ /* We have a potential match. Let's start to get things going. */
+ DRESET(&da); dstr_putf(&da, "%s/%s", str + p->vol, str + p->name);
+ DRESET(&db); dstr_putf(&db, "%s/%s", str + q->vol, str + q->name);
+
+ /* Make sure that our data structure is in a good state. If this is
+ * wrong then we might have missed a common volume, for example.
+ */
+ picky_link_check(cv, cvl);
+
+ /* Investigate more carefully to make sure this is actually going to
+ * work. Maybe we didn't record the file status properly.
+ */
+ if (p->st.st_mode != q->st.st_mode ||
+ p->st.st_mtime != q->st.st_mtime ||
+ p->st.st_size != q->st.st_size ||
+ p->st.st_uid != q->st.st_uid ||
+ p->st.st_gid != q->st.st_gid) {
+ moan("rejecting potential link `%s' <-> `%s': stat mismatch",
+ da.buf, db.buf);
+ goto nextset;
+ }
+
+ /* Check the actual file contents. (This could take a while, but we
+ * really don't want to screw up.)
+ */
+ if ((fda = open(da.buf, O_RDONLY)) < 0 ||
+ (fdb = open(db.buf, O_RDONLY)) < 0) {
+ moan("open failed for potential link `%s' <-> `%s': %s",
+ da.buf, db.buf, strerror(errno));
+ goto nextset_close;
+ }
+ for (;;) {
+ na = read(fda, bufa, sizeof(bufa));
+ nb = read(fdb, bufb, sizeof(bufb));
+ if (na < 0 || nb < 0) {
+ moan("read failed for potential link `%s' <-> `%s': %s",
+ da.buf, db.buf, strerror(errno));
+ goto nextset_close;
+ }
+ if (na != nb) {
+ moan("rejecting potential link `%s' <-> `%s': length mismatch",
+ da.buf, db.buf);
+ goto nextset_close;
+ }
+ if (!na) break;
+ if (memcmp(bufa, bufb, na) != 0) {
+ moan("rejecting potential link `%s' <-> `%s': content mismatch",
+ da.buf, db.buf);
+ goto nextset_close;
+ }
+ }
+
+ /* Actually do the work. Well, actually, not yet; just report on it.
+ */
+ printf("save %lu\n", p->st.st_blocks*512);
+ printf(" -> %s\n", db.buf);
+ for (pp = p; pp; pp = pp->next)
+ printf(" <- %s/%s\n", str + pp->vol, str + pp->name);
+ printf("\n");
+
+ /* Now actually merge the structures together. */
+ for (pp = p;; pp = pp->next) {
+ pp->st = q->st;
+ pp->rep = q;
+ if (!pp->next) { pp->next = q->next; break; }
+ }
+ q->next = p;
+ if (p->nextset) p->nextset->prevset = p->prevset;
+ if (p->prevset) p->prevset->nextset = p->nextset;
+
+#ifdef DEBUG
+ dump_link_map(cv, cvl);
+#endif
+
+ nextset_close:
+ if (fda >= 0) { close(fda); fda = -1; }
+ if (fdb >= 0) { close(fdb); fdb = -1; }
+ break;
+
+ nextset:;
+ }
+ }
+
+ /* All done. */
+ dstr_destroy(&da);
+ dstr_destroy(&db);
+}
+
+int main(int argc, char *argv[])
+{
+ dstr dx = DSTR_INIT, dy = DSTR_INIT, *d = &dx, *dd = &dy, *dt;
+ dstr dc = DSTR_INIT;
+ cand_v cv = DA_INIT;
+ char *n, *t, *k;
+ size_t ksz, oksz = 0;
+ int i, x;
+ size_t len;
+
+ /* Explain who we are. */
+ ego(argv[0]);
+
+ /* Work through each input line in turn. */
+ for (;;) {
+
+ /* Read the next line. */
+ DRESET(d);
+ if (dstr_putline(d, stdin) == EOF) break;
+
+ /* Split the line into key, tag, and filename fields. */
+ k = d->buf;
+ t = strchr(k, '|'); if (!t) goto parsefail; ksz = t - k; *t++ = 0;
+ n = strchr(t, '|'); if (!n) goto parsefail; *n++ = 0;
+
+ /* If this key is not the same as the previous one then flush the current
+ * candidate set and swap buffers (to keep the current key available).
+ */
+ if (ksz != oksz || memcmp(k, dd->buf, ksz) != 0) {
+ flush_link_candidate_set(DA(&cv), DA_LEN(&cv), dc.buf);
+ dt = d; d = dd; dd = dt; oksz = ksz;
+ DA_RESET(&cv);
+ DRESET(&dc);
+ }
+
+ /* Commit the volume tag and name to the string buffer, and record the
+ * offsets in a new candidate record.
+ *
+ * This involves decoding the filename, which is a little annoying.
+ */
+ DA_ENSURE(&cv, 1); i = DA_LEN(&cv);
+ DA(&cv)[i].vol = dc.len; DPUTS(&dc, t); DPUTC(&dc, '/');
+ DA(&cv)[i].name = dc.len;
+ while (*n) {
+
+ /* Not a backslash. Gather up a chunk of not-backslash stuff and
+ * commit it in one go.
+ */
+ if (*n != '\\') {
+ len = strcspn(n, "\\");
+ DPUTM(&dc, n, len);
+ n += len;
+ continue;
+ }
+
+ /* Decode an escaped thing. Only handle the things which aren't
+ * actually literals.
+ */
+ n++;
+ switch (*n) {
+ case 0: goto parsefail;
+ case 'n': DPUTC(&dc, '\n'); n++; break;
+ case 'r': DPUTC(&dc, '\r'); n++; break;
+ case 't': DPUTC(&dc, '\t'); n++; break;
+ case 'x':
+ n++;
+ if ('0' <= *n && *n <= '9') x = *n - '0';
+ else if ('A' <= *n && *n <= 'F') x = *n + 10 - 'A';
+ else if ('a' <= *n && *n <= 'f') x = *n + 10 - 'a';
+ else goto parsefail;
+ n++; x <<= 4;
+ if ('0' <= *n && *n <= '9') x |= *n - '0';
+ else if ('A' <= *n && *n <= 'F') x |= *n + 10 - 'A';
+ else if ('a' <= *n && *n <= 'f') x |= *n + 10 - 'a';
+ else goto parsefail;
+ n++;
+ DPUTC(&dc, x);
+ break;
+ }
+ }
+ DPUTC(&dc, 0);
+
+ /* Fetch the inode number for the file, and then replace the `/' with a
+ * zero.
+ */
+ n = dc.buf + DA(&cv)[i].vol;
+ if (stat(n, &DA(&cv)[i].st)) die(1, "stat(%s): %s", n, strerror(errno));
+ dc.buf[DA(&cv)[i].name - 1] = 0;
+
+ /* Commit the candidate record and continue. */
+ DA_EXTEND(&cv, 1);
+ }
+
+ /* Flush any remaining candidate set. */
+ flush_link_candidate_set(DA(&cv), DA_LEN(&cv), dc.buf);
+ return (0);
+
+parsefail:
+ die(1, "parse failure");
+}