Commit | Line | Data |
---|---|---|
cb8be370 MW |
1 | #include <assert.h> |
2 | #include <errno.h> | |
3 | #include <stdio.h> | |
4 | #include <stdlib.h> | |
5 | #include <string.h> | |
6 | ||
7 | #include <sys/types.h> | |
8 | #include <sys/stat.h> | |
9 | #include <unistd.h> | |
10 | #include <fcntl.h> | |
11 | ||
12 | #include <mLib/dstr.h> | |
13 | #include <mLib/darray.h> | |
14 | #include <mLib/macros.h> | |
15 | #include <mLib/quis.h> | |
16 | #include <mLib/report.h> | |
17 | ||
18 | struct cand { | |
19 | size_t name, vol; | |
20 | struct stat st; | |
21 | struct cand *nextset, *prevset; | |
22 | struct cand *next, *rep; | |
23 | }; | |
24 | ||
25 | DA_DECL(cand_v, struct cand); | |
26 | ||
27 | /* #define DEBUG */ | |
28 | ||
29 | static void picky_link_check(const struct cand *cv, const struct cand *cvl) | |
30 | { | |
31 | const struct cand *p, *q, *r; | |
32 | ||
33 | /* Make sure the data structure is set up right. */ | |
34 | for (p = cv; p < cvl; p++) { | |
35 | ||
36 | /* 1. Each candidate should be reachable from its representative. */ | |
37 | for (q = p->rep; q; q = q->next) | |
38 | if (q == p) goto found_1; | |
39 | assert(!"struct-fail 1"); | |
40 | found_1: | |
41 | ||
42 | /* 2. If the candidate is a representative then it should be reachable | |
43 | * from the root. | |
44 | */ | |
45 | if (p == p->rep) { | |
46 | for (q = cv; q; q = q->nextset) | |
47 | if (q == p) goto found_2; | |
48 | assert(!"struct-fail 2"); | |
49 | found_2:; | |
50 | } | |
51 | } | |
52 | ||
53 | for (r = 0, p = cv; p; r = p, p = p->nextset) { | |
54 | ||
55 | /* 3. Backlinks. */ | |
56 | assert(p->prevset == r); | |
57 | ||
58 | /* 4. All of the candidates in the set should have the same inode number | |
59 | * and link back to us. | |
60 | */ | |
61 | for (q = p; q; q = q->next) { | |
62 | assert(q->rep == p); | |
63 | assert(q->st.st_ino == p->st.st_ino); | |
64 | } | |
65 | ||
66 | /* 5. Other sets should have different inode numbers. */ | |
67 | for (q = cv; q; q = q->nextset) { | |
68 | if (q == p) continue; | |
69 | assert(q->st.st_ino != p->st.st_ino); | |
70 | } | |
71 | } | |
72 | } | |
73 | ||
74 | #ifdef DEBUG | |
75 | ||
76 | static void dump_link_map(const struct cand *cv, const struct cand *cvl) | |
77 | { | |
78 | const struct cand *p; | |
79 | ||
80 | for (p = cv; p < cvl; p++) { | |
81 | printf("%2d: ino=%08lx rep=%ld next=%ld nextset=%ld prevset=%ld\n", | |
82 | p - cv, p->st.st_ino, p->rep - cv, p->next ? p->next - cv : -1, | |
83 | p == p->rep && p->nextset ? p->nextset - cv : -1, | |
84 | p == p->rep && p->prevset ? p->prevset - cv : -1); | |
85 | } | |
86 | printf("\n"); | |
87 | picky_link_check(cv, cvl); | |
88 | } | |
89 | ||
90 | #endif | |
91 | ||
92 | #define BUFSZ 16384 | |
93 | ||
94 | static void flush_link_candidate_set(struct cand *cv, size_t n, | |
95 | const char *str) | |
96 | { | |
97 | struct cand *p, *q, *pp, *qq; | |
98 | const struct cand *cvl = cv + n; | |
99 | dstr da = DSTR_INIT, db = DSTR_INIT; | |
100 | int fda = -1, fdb = -1; | |
101 | ssize_t na, nb; | |
102 | char bufa[BUFSZ], bufb[BUFSZ]; | |
103 | ||
104 | /* If there are no candidates, then back we go. (This happens when we | |
105 | * read the first line.) | |
106 | */ | |
107 | if (!n) return; | |
108 | ||
109 | /* Here, we trundle through the candidates and link them into sets sharing | |
110 | * the same inode number. Candidates within a chain are linked by their | |
111 | * `next' pointers, and each member (including the representative itself) | |
112 | * has an `rep' pointer to a set representative, which is the lowest-indexed | |
113 | * candidate in the set. The set representatives are linked into a | |
114 | * doubly-linked list by `nextset' and `prevset'. | |
115 | * | |
116 | * There are no separate list heads. Instead, we just know that the first | |
117 | * element is must be a set representative. Handle this element specially | |
118 | * as a result to initialize the chains. | |
119 | */ | |
120 | cv->rep = cv; cv->next = 0; cv->nextset = cv->prevset = 0; | |
121 | for (p = cv + 1; p < cvl; p++) { | |
122 | for (q = cv; q < p; q++) { | |
123 | if (p->st.st_ino == q->st.st_ino) { | |
124 | p->rep = q; | |
125 | p->next = q->next; | |
126 | q->next = p; | |
127 | goto linked; | |
128 | } | |
129 | } | |
130 | p->rep = p; p->next = 0; | |
131 | p->nextset = cv->nextset; p->prevset = cv; | |
132 | if (cv->nextset) cv->nextset->prevset = p; | |
133 | cv->nextset = p; | |
134 | linked:; | |
135 | } | |
136 | #ifdef DEBUG | |
137 | dump_link_map(cv, cvl); | |
138 | #endif | |
139 | ||
140 | /* Now see whether we can merge link sets. We can do this if and only if | |
141 | * they have no volumes in common. | |
142 | */ | |
143 | for (p = cv->nextset; p; p = p->nextset) { | |
144 | for (q = cv; q != p; q = q->nextset) { | |
145 | for (pp = p; pp; pp = pp->next) { | |
146 | for (qq = q; qq; qq = qq->next) | |
147 | if (strcmp(str + pp->vol, str + qq->vol) == 0) goto nextset; | |
148 | } | |
149 | ||
150 | /* We have a potential match. Let's start to get things going. */ | |
151 | DRESET(&da); dstr_putf(&da, "%s/%s", str + p->vol, str + p->name); | |
152 | DRESET(&db); dstr_putf(&db, "%s/%s", str + q->vol, str + q->name); | |
153 | ||
154 | /* Make sure that our data structure is in a good state. If this is | |
155 | * wrong then we might have missed a common volume, for example. | |
156 | */ | |
157 | picky_link_check(cv, cvl); | |
158 | ||
159 | /* Investigate more carefully to make sure this is actually going to | |
160 | * work. Maybe we didn't record the file status properly. | |
161 | */ | |
162 | if (p->st.st_mode != q->st.st_mode || | |
163 | p->st.st_mtime != q->st.st_mtime || | |
164 | p->st.st_size != q->st.st_size || | |
165 | p->st.st_uid != q->st.st_uid || | |
166 | p->st.st_gid != q->st.st_gid) { | |
167 | moan("rejecting potential link `%s' <-> `%s': stat mismatch", | |
168 | da.buf, db.buf); | |
169 | goto nextset; | |
170 | } | |
171 | ||
172 | /* Check the actual file contents. (This could take a while, but we | |
173 | * really don't want to screw up.) | |
174 | */ | |
175 | if ((fda = open(da.buf, O_RDONLY)) < 0 || | |
176 | (fdb = open(db.buf, O_RDONLY)) < 0) { | |
177 | moan("open failed for potential link `%s' <-> `%s': %s", | |
178 | da.buf, db.buf, strerror(errno)); | |
179 | goto nextset_close; | |
180 | } | |
181 | for (;;) { | |
182 | na = read(fda, bufa, sizeof(bufa)); | |
183 | nb = read(fdb, bufb, sizeof(bufb)); | |
184 | if (na < 0 || nb < 0) { | |
185 | moan("read failed for potential link `%s' <-> `%s': %s", | |
186 | da.buf, db.buf, strerror(errno)); | |
187 | goto nextset_close; | |
188 | } | |
189 | if (na != nb) { | |
190 | moan("rejecting potential link `%s' <-> `%s': length mismatch", | |
191 | da.buf, db.buf); | |
192 | goto nextset_close; | |
193 | } | |
194 | if (!na) break; | |
195 | if (memcmp(bufa, bufb, na) != 0) { | |
196 | moan("rejecting potential link `%s' <-> `%s': content mismatch", | |
197 | da.buf, db.buf); | |
198 | goto nextset_close; | |
199 | } | |
200 | } | |
201 | ||
202 | /* Actually do the work. Well, actually, not yet; just report on it. | |
203 | */ | |
204 | printf("save %lu\n", p->st.st_blocks*512); | |
205 | printf(" -> %s\n", db.buf); | |
206 | for (pp = p; pp; pp = pp->next) | |
207 | printf(" <- %s/%s\n", str + pp->vol, str + pp->name); | |
208 | printf("\n"); | |
209 | ||
210 | /* Now actually merge the structures together. */ | |
211 | for (pp = p;; pp = pp->next) { | |
212 | pp->st = q->st; | |
213 | pp->rep = q; | |
214 | if (!pp->next) { pp->next = q->next; break; } | |
215 | } | |
216 | q->next = p; | |
217 | if (p->nextset) p->nextset->prevset = p->prevset; | |
218 | if (p->prevset) p->prevset->nextset = p->nextset; | |
219 | ||
220 | #ifdef DEBUG | |
221 | dump_link_map(cv, cvl); | |
222 | #endif | |
223 | ||
224 | nextset_close: | |
225 | if (fda >= 0) { close(fda); fda = -1; } | |
226 | if (fdb >= 0) { close(fdb); fdb = -1; } | |
227 | break; | |
228 | ||
229 | nextset:; | |
230 | } | |
231 | } | |
232 | ||
233 | /* All done. */ | |
234 | dstr_destroy(&da); | |
235 | dstr_destroy(&db); | |
236 | } | |
237 | ||
238 | int main(int argc, char *argv[]) | |
239 | { | |
240 | dstr dx = DSTR_INIT, dy = DSTR_INIT, *d = &dx, *dd = &dy, *dt; | |
241 | dstr dc = DSTR_INIT; | |
242 | cand_v cv = DA_INIT; | |
243 | char *n, *t, *k; | |
244 | size_t ksz, oksz = 0; | |
245 | int i, x; | |
246 | size_t len; | |
247 | ||
248 | /* Explain who we are. */ | |
249 | ego(argv[0]); | |
250 | ||
251 | /* Work through each input line in turn. */ | |
252 | for (;;) { | |
253 | ||
254 | /* Read the next line. */ | |
255 | DRESET(d); | |
256 | if (dstr_putline(d, stdin) == EOF) break; | |
257 | ||
258 | /* Split the line into key, tag, and filename fields. */ | |
259 | k = d->buf; | |
260 | t = strchr(k, '|'); if (!t) goto parsefail; ksz = t - k; *t++ = 0; | |
261 | n = strchr(t, '|'); if (!n) goto parsefail; *n++ = 0; | |
262 | ||
263 | /* If this key is not the same as the previous one then flush the current | |
264 | * candidate set and swap buffers (to keep the current key available). | |
265 | */ | |
266 | if (ksz != oksz || memcmp(k, dd->buf, ksz) != 0) { | |
267 | flush_link_candidate_set(DA(&cv), DA_LEN(&cv), dc.buf); | |
268 | dt = d; d = dd; dd = dt; oksz = ksz; | |
269 | DA_RESET(&cv); | |
270 | DRESET(&dc); | |
271 | } | |
272 | ||
273 | /* Commit the volume tag and name to the string buffer, and record the | |
274 | * offsets in a new candidate record. | |
275 | * | |
276 | * This involves decoding the filename, which is a little annoying. | |
277 | */ | |
278 | DA_ENSURE(&cv, 1); i = DA_LEN(&cv); | |
279 | DA(&cv)[i].vol = dc.len; DPUTS(&dc, t); DPUTC(&dc, '/'); | |
280 | DA(&cv)[i].name = dc.len; | |
281 | while (*n) { | |
282 | ||
283 | /* Not a backslash. Gather up a chunk of not-backslash stuff and | |
284 | * commit it in one go. | |
285 | */ | |
286 | if (*n != '\\') { | |
287 | len = strcspn(n, "\\"); | |
288 | DPUTM(&dc, n, len); | |
289 | n += len; | |
290 | continue; | |
291 | } | |
292 | ||
293 | /* Decode an escaped thing. Only handle the things which aren't | |
294 | * actually literals. | |
295 | */ | |
296 | n++; | |
297 | switch (*n) { | |
298 | case 0: goto parsefail; | |
299 | case 'n': DPUTC(&dc, '\n'); n++; break; | |
300 | case 'r': DPUTC(&dc, '\r'); n++; break; | |
301 | case 't': DPUTC(&dc, '\t'); n++; break; | |
302 | case 'x': | |
303 | n++; | |
304 | if ('0' <= *n && *n <= '9') x = *n - '0'; | |
305 | else if ('A' <= *n && *n <= 'F') x = *n + 10 - 'A'; | |
306 | else if ('a' <= *n && *n <= 'f') x = *n + 10 - 'a'; | |
307 | else goto parsefail; | |
308 | n++; x <<= 4; | |
309 | if ('0' <= *n && *n <= '9') x |= *n - '0'; | |
310 | else if ('A' <= *n && *n <= 'F') x |= *n + 10 - 'A'; | |
311 | else if ('a' <= *n && *n <= 'f') x |= *n + 10 - 'a'; | |
312 | else goto parsefail; | |
313 | n++; | |
314 | DPUTC(&dc, x); | |
315 | break; | |
316 | } | |
317 | } | |
318 | DPUTC(&dc, 0); | |
319 | ||
320 | /* Fetch the inode number for the file, and then replace the `/' with a | |
321 | * zero. | |
322 | */ | |
323 | n = dc.buf + DA(&cv)[i].vol; | |
324 | if (stat(n, &DA(&cv)[i].st)) die(1, "stat(%s): %s", n, strerror(errno)); | |
325 | dc.buf[DA(&cv)[i].name - 1] = 0; | |
326 | ||
327 | /* Commit the candidate record and continue. */ | |
328 | DA_EXTEND(&cv, 1); | |
329 | } | |
330 | ||
331 | /* Flush any remaining candidate set. */ | |
332 | flush_link_candidate_set(DA(&cv), DA_LEN(&cv), dc.buf); | |
333 | return (0); | |
334 | ||
335 | parsefail: | |
336 | die(1, "parse failure"); | |
337 | } |