| 1 | #include <assert.h> |
| 2 | #include <errno.h> |
| 3 | #include <stdio.h> |
| 4 | #include <stdlib.h> |
| 5 | #include <string.h> |
| 6 | |
| 7 | #include <sys/types.h> |
| 8 | #include <sys/stat.h> |
| 9 | #include <unistd.h> |
| 10 | #include <fcntl.h> |
| 11 | |
| 12 | #include <mLib/dstr.h> |
| 13 | #include <mLib/darray.h> |
| 14 | #include <mLib/macros.h> |
| 15 | #include <mLib/quis.h> |
| 16 | #include <mLib/report.h> |
| 17 | |
| 18 | struct cand { |
| 19 | size_t name, vol; |
| 20 | struct stat st; |
| 21 | struct cand *nextset, *prevset; |
| 22 | struct cand *next, *rep; |
| 23 | }; |
| 24 | |
| 25 | DA_DECL(cand_v, struct cand); |
| 26 | |
| 27 | /* #define DEBUG */ |
| 28 | |
| 29 | static void picky_link_check(const struct cand *cv, const struct cand *cvl) |
| 30 | { |
| 31 | const struct cand *p, *q, *r; |
| 32 | |
| 33 | /* Make sure the data structure is set up right. */ |
| 34 | for (p = cv; p < cvl; p++) { |
| 35 | |
| 36 | /* 1. Each candidate should be reachable from its representative. */ |
| 37 | for (q = p->rep; q; q = q->next) |
| 38 | if (q == p) goto found_1; |
| 39 | assert(!"struct-fail 1"); |
| 40 | found_1: |
| 41 | |
| 42 | /* 2. If the candidate is a representative then it should be reachable |
| 43 | * from the root. |
| 44 | */ |
| 45 | if (p == p->rep) { |
| 46 | for (q = cv; q; q = q->nextset) |
| 47 | if (q == p) goto found_2; |
| 48 | assert(!"struct-fail 2"); |
| 49 | found_2:; |
| 50 | } |
| 51 | } |
| 52 | |
| 53 | for (r = 0, p = cv; p; r = p, p = p->nextset) { |
| 54 | |
| 55 | /* 3. Backlinks. */ |
| 56 | assert(p->prevset == r); |
| 57 | |
| 58 | /* 4. All of the candidates in the set should have the same inode number |
| 59 | * and link back to us. |
| 60 | */ |
| 61 | for (q = p; q; q = q->next) { |
| 62 | assert(q->rep == p); |
| 63 | assert(q->st.st_ino == p->st.st_ino); |
| 64 | } |
| 65 | |
| 66 | /* 5. Other sets should have different inode numbers. */ |
| 67 | for (q = cv; q; q = q->nextset) { |
| 68 | if (q == p) continue; |
| 69 | assert(q->st.st_ino != p->st.st_ino); |
| 70 | } |
| 71 | } |
| 72 | } |
| 73 | |
| 74 | #ifdef DEBUG |
| 75 | |
| 76 | static void dump_link_map(const struct cand *cv, const struct cand *cvl) |
| 77 | { |
| 78 | const struct cand *p; |
| 79 | |
| 80 | for (p = cv; p < cvl; p++) { |
| 81 | printf("%2d: ino=%08lx rep=%ld next=%ld nextset=%ld prevset=%ld\n", |
| 82 | p - cv, p->st.st_ino, p->rep - cv, p->next ? p->next - cv : -1, |
| 83 | p == p->rep && p->nextset ? p->nextset - cv : -1, |
| 84 | p == p->rep && p->prevset ? p->prevset - cv : -1); |
| 85 | } |
| 86 | printf("\n"); |
| 87 | picky_link_check(cv, cvl); |
| 88 | } |
| 89 | |
| 90 | #endif |
| 91 | |
| 92 | #define BUFSZ 16384 |
| 93 | |
| 94 | static void flush_link_candidate_set(struct cand *cv, size_t n, |
| 95 | const char *str) |
| 96 | { |
| 97 | struct cand *p, *q, *pp, *qq; |
| 98 | const struct cand *cvl = cv + n; |
| 99 | dstr da = DSTR_INIT, db = DSTR_INIT; |
| 100 | int fda = -1, fdb = -1; |
| 101 | ssize_t na, nb; |
| 102 | char bufa[BUFSZ], bufb[BUFSZ]; |
| 103 | |
| 104 | /* If there are no candidates, then back we go. (This happens when we |
| 105 | * read the first line.) |
| 106 | */ |
| 107 | if (!n) return; |
| 108 | |
| 109 | /* Here, we trundle through the candidates and link them into sets sharing |
| 110 | * the same inode number. Candidates within a chain are linked by their |
| 111 | * `next' pointers, and each member (including the representative itself) |
| 112 | * has an `rep' pointer to a set representative, which is the lowest-indexed |
| 113 | * candidate in the set. The set representatives are linked into a |
| 114 | * doubly-linked list by `nextset' and `prevset'. |
| 115 | * |
| 116 | * There are no separate list heads. Instead, we just know that the first |
| 117 | * element is must be a set representative. Handle this element specially |
| 118 | * as a result to initialize the chains. |
| 119 | */ |
| 120 | cv->rep = cv; cv->next = 0; cv->nextset = cv->prevset = 0; |
| 121 | for (p = cv + 1; p < cvl; p++) { |
| 122 | for (q = cv; q < p; q++) { |
| 123 | if (p->st.st_ino == q->st.st_ino) { |
| 124 | p->rep = q; |
| 125 | p->next = q->next; |
| 126 | q->next = p; |
| 127 | goto linked; |
| 128 | } |
| 129 | } |
| 130 | p->rep = p; p->next = 0; |
| 131 | p->nextset = cv->nextset; p->prevset = cv; |
| 132 | if (cv->nextset) cv->nextset->prevset = p; |
| 133 | cv->nextset = p; |
| 134 | linked:; |
| 135 | } |
| 136 | #ifdef DEBUG |
| 137 | dump_link_map(cv, cvl); |
| 138 | #endif |
| 139 | |
| 140 | /* Now see whether we can merge link sets. We can do this if and only if |
| 141 | * they have no volumes in common. |
| 142 | */ |
| 143 | for (p = cv->nextset; p; p = p->nextset) { |
| 144 | for (q = cv; q != p; q = q->nextset) { |
| 145 | for (pp = p; pp; pp = pp->next) { |
| 146 | for (qq = q; qq; qq = qq->next) |
| 147 | if (strcmp(str + pp->vol, str + qq->vol) == 0) goto nextset; |
| 148 | } |
| 149 | |
| 150 | /* We have a potential match. Let's start to get things going. */ |
| 151 | DRESET(&da); dstr_putf(&da, "%s/%s", str + p->vol, str + p->name); |
| 152 | DRESET(&db); dstr_putf(&db, "%s/%s", str + q->vol, str + q->name); |
| 153 | |
| 154 | /* Make sure that our data structure is in a good state. If this is |
| 155 | * wrong then we might have missed a common volume, for example. |
| 156 | */ |
| 157 | picky_link_check(cv, cvl); |
| 158 | |
| 159 | /* Investigate more carefully to make sure this is actually going to |
| 160 | * work. Maybe we didn't record the file status properly. |
| 161 | */ |
| 162 | if (p->st.st_mode != q->st.st_mode || |
| 163 | p->st.st_mtime != q->st.st_mtime || |
| 164 | p->st.st_size != q->st.st_size || |
| 165 | p->st.st_uid != q->st.st_uid || |
| 166 | p->st.st_gid != q->st.st_gid) { |
| 167 | moan("rejecting potential link `%s' <-> `%s': stat mismatch", |
| 168 | da.buf, db.buf); |
| 169 | goto nextset; |
| 170 | } |
| 171 | |
| 172 | /* Check the actual file contents. (This could take a while, but we |
| 173 | * really don't want to screw up.) |
| 174 | */ |
| 175 | if ((fda = open(da.buf, O_RDONLY)) < 0 || |
| 176 | (fdb = open(db.buf, O_RDONLY)) < 0) { |
| 177 | moan("open failed for potential link `%s' <-> `%s': %s", |
| 178 | da.buf, db.buf, strerror(errno)); |
| 179 | goto nextset_close; |
| 180 | } |
| 181 | for (;;) { |
| 182 | na = read(fda, bufa, sizeof(bufa)); |
| 183 | nb = read(fdb, bufb, sizeof(bufb)); |
| 184 | if (na < 0 || nb < 0) { |
| 185 | moan("read failed for potential link `%s' <-> `%s': %s", |
| 186 | da.buf, db.buf, strerror(errno)); |
| 187 | goto nextset_close; |
| 188 | } |
| 189 | if (na != nb) { |
| 190 | moan("rejecting potential link `%s' <-> `%s': length mismatch", |
| 191 | da.buf, db.buf); |
| 192 | goto nextset_close; |
| 193 | } |
| 194 | if (!na) break; |
| 195 | if (memcmp(bufa, bufb, na) != 0) { |
| 196 | moan("rejecting potential link `%s' <-> `%s': content mismatch", |
| 197 | da.buf, db.buf); |
| 198 | goto nextset_close; |
| 199 | } |
| 200 | } |
| 201 | |
| 202 | /* Actually do the work. Well, actually, not yet; just report on it. |
| 203 | */ |
| 204 | printf("save %lu\n", p->st.st_blocks*512); |
| 205 | printf(" -> %s\n", db.buf); |
| 206 | for (pp = p; pp; pp = pp->next) |
| 207 | printf(" <- %s/%s\n", str + pp->vol, str + pp->name); |
| 208 | printf("\n"); |
| 209 | |
| 210 | /* Now actually merge the structures together. */ |
| 211 | for (pp = p;; pp = pp->next) { |
| 212 | pp->st = q->st; |
| 213 | pp->rep = q; |
| 214 | if (!pp->next) { pp->next = q->next; break; } |
| 215 | } |
| 216 | q->next = p; |
| 217 | if (p->nextset) p->nextset->prevset = p->prevset; |
| 218 | if (p->prevset) p->prevset->nextset = p->nextset; |
| 219 | |
| 220 | #ifdef DEBUG |
| 221 | dump_link_map(cv, cvl); |
| 222 | #endif |
| 223 | |
| 224 | nextset_close: |
| 225 | if (fda >= 0) { close(fda); fda = -1; } |
| 226 | if (fdb >= 0) { close(fdb); fdb = -1; } |
| 227 | break; |
| 228 | |
| 229 | nextset:; |
| 230 | } |
| 231 | } |
| 232 | |
| 233 | /* All done. */ |
| 234 | dstr_destroy(&da); |
| 235 | dstr_destroy(&db); |
| 236 | } |
| 237 | |
| 238 | int main(int argc, char *argv[]) |
| 239 | { |
| 240 | dstr dx = DSTR_INIT, dy = DSTR_INIT, *d = &dx, *dd = &dy, *dt; |
| 241 | dstr dc = DSTR_INIT; |
| 242 | cand_v cv = DA_INIT; |
| 243 | char *n, *t, *k; |
| 244 | size_t ksz, oksz = 0; |
| 245 | int i, x; |
| 246 | size_t len; |
| 247 | |
| 248 | /* Explain who we are. */ |
| 249 | ego(argv[0]); |
| 250 | |
| 251 | /* Work through each input line in turn. */ |
| 252 | for (;;) { |
| 253 | |
| 254 | /* Read the next line. */ |
| 255 | DRESET(d); |
| 256 | if (dstr_putline(d, stdin) == EOF) break; |
| 257 | |
| 258 | /* Split the line into key, tag, and filename fields. */ |
| 259 | k = d->buf; |
| 260 | t = strchr(k, '|'); if (!t) goto parsefail; ksz = t - k; *t++ = 0; |
| 261 | n = strchr(t, '|'); if (!n) goto parsefail; *n++ = 0; |
| 262 | |
| 263 | /* If this key is not the same as the previous one then flush the current |
| 264 | * candidate set and swap buffers (to keep the current key available). |
| 265 | */ |
| 266 | if (ksz != oksz || memcmp(k, dd->buf, ksz) != 0) { |
| 267 | flush_link_candidate_set(DA(&cv), DA_LEN(&cv), dc.buf); |
| 268 | dt = d; d = dd; dd = dt; oksz = ksz; |
| 269 | DA_RESET(&cv); |
| 270 | DRESET(&dc); |
| 271 | } |
| 272 | |
| 273 | /* Commit the volume tag and name to the string buffer, and record the |
| 274 | * offsets in a new candidate record. |
| 275 | * |
| 276 | * This involves decoding the filename, which is a little annoying. |
| 277 | */ |
| 278 | DA_ENSURE(&cv, 1); i = DA_LEN(&cv); |
| 279 | DA(&cv)[i].vol = dc.len; DPUTS(&dc, t); DPUTC(&dc, '/'); |
| 280 | DA(&cv)[i].name = dc.len; |
| 281 | while (*n) { |
| 282 | |
| 283 | /* Not a backslash. Gather up a chunk of not-backslash stuff and |
| 284 | * commit it in one go. |
| 285 | */ |
| 286 | if (*n != '\\') { |
| 287 | len = strcspn(n, "\\"); |
| 288 | DPUTM(&dc, n, len); |
| 289 | n += len; |
| 290 | continue; |
| 291 | } |
| 292 | |
| 293 | /* Decode an escaped thing. Only handle the things which aren't |
| 294 | * actually literals. |
| 295 | */ |
| 296 | n++; |
| 297 | switch (*n) { |
| 298 | case 0: goto parsefail; |
| 299 | case 'n': DPUTC(&dc, '\n'); n++; break; |
| 300 | case 'r': DPUTC(&dc, '\r'); n++; break; |
| 301 | case 't': DPUTC(&dc, '\t'); n++; break; |
| 302 | case 'x': |
| 303 | n++; |
| 304 | if ('0' <= *n && *n <= '9') x = *n - '0'; |
| 305 | else if ('A' <= *n && *n <= 'F') x = *n + 10 - 'A'; |
| 306 | else if ('a' <= *n && *n <= 'f') x = *n + 10 - 'a'; |
| 307 | else goto parsefail; |
| 308 | n++; x <<= 4; |
| 309 | if ('0' <= *n && *n <= '9') x |= *n - '0'; |
| 310 | else if ('A' <= *n && *n <= 'F') x |= *n + 10 - 'A'; |
| 311 | else if ('a' <= *n && *n <= 'f') x |= *n + 10 - 'a'; |
| 312 | else goto parsefail; |
| 313 | n++; |
| 314 | DPUTC(&dc, x); |
| 315 | break; |
| 316 | } |
| 317 | } |
| 318 | DPUTC(&dc, 0); |
| 319 | |
| 320 | /* Fetch the inode number for the file, and then replace the `/' with a |
| 321 | * zero. |
| 322 | */ |
| 323 | n = dc.buf + DA(&cv)[i].vol; |
| 324 | if (stat(n, &DA(&cv)[i].st)) die(1, "stat(%s): %s", n, strerror(errno)); |
| 325 | dc.buf[DA(&cv)[i].name - 1] = 0; |
| 326 | |
| 327 | /* Commit the candidate record and continue. */ |
| 328 | DA_EXTEND(&cv, 1); |
| 329 | } |
| 330 | |
| 331 | /* Flush any remaining candidate set. */ |
| 332 | flush_link_candidate_set(DA(&cv), DA_LEN(&cv), dc.buf); |
| 333 | return (0); |
| 334 | |
| 335 | parsefail: |
| 336 | die(1, "parse failure"); |
| 337 | } |