dedup-{format,sift}.c: Find ways to save space by making hardlinks.
[rsync-backup] / dedup-sift.c
CommitLineData
cb8be370
MW
1#include <assert.h>
2#include <errno.h>
3#include <stdio.h>
4#include <stdlib.h>
5#include <string.h>
6
7#include <sys/types.h>
8#include <sys/stat.h>
9#include <unistd.h>
10#include <fcntl.h>
11
12#include <mLib/dstr.h>
13#include <mLib/darray.h>
14#include <mLib/macros.h>
15#include <mLib/quis.h>
16#include <mLib/report.h>
17
18struct cand {
19 size_t name, vol;
20 struct stat st;
21 struct cand *nextset, *prevset;
22 struct cand *next, *rep;
23};
24
25DA_DECL(cand_v, struct cand);
26
27/* #define DEBUG */
28
29static void picky_link_check(const struct cand *cv, const struct cand *cvl)
30{
31 const struct cand *p, *q, *r;
32
33 /* Make sure the data structure is set up right. */
34 for (p = cv; p < cvl; p++) {
35
36 /* 1. Each candidate should be reachable from its representative. */
37 for (q = p->rep; q; q = q->next)
38 if (q == p) goto found_1;
39 assert(!"struct-fail 1");
40 found_1:
41
42 /* 2. If the candidate is a representative then it should be reachable
43 * from the root.
44 */
45 if (p == p->rep) {
46 for (q = cv; q; q = q->nextset)
47 if (q == p) goto found_2;
48 assert(!"struct-fail 2");
49 found_2:;
50 }
51 }
52
53 for (r = 0, p = cv; p; r = p, p = p->nextset) {
54
55 /* 3. Backlinks. */
56 assert(p->prevset == r);
57
58 /* 4. All of the candidates in the set should have the same inode number
59 * and link back to us.
60 */
61 for (q = p; q; q = q->next) {
62 assert(q->rep == p);
63 assert(q->st.st_ino == p->st.st_ino);
64 }
65
66 /* 5. Other sets should have different inode numbers. */
67 for (q = cv; q; q = q->nextset) {
68 if (q == p) continue;
69 assert(q->st.st_ino != p->st.st_ino);
70 }
71 }
72}
73
74#ifdef DEBUG
75
76static void dump_link_map(const struct cand *cv, const struct cand *cvl)
77{
78 const struct cand *p;
79
80 for (p = cv; p < cvl; p++) {
81 printf("%2d: ino=%08lx rep=%ld next=%ld nextset=%ld prevset=%ld\n",
82 p - cv, p->st.st_ino, p->rep - cv, p->next ? p->next - cv : -1,
83 p == p->rep && p->nextset ? p->nextset - cv : -1,
84 p == p->rep && p->prevset ? p->prevset - cv : -1);
85 }
86 printf("\n");
87 picky_link_check(cv, cvl);
88}
89
90#endif
91
92#define BUFSZ 16384
93
94static void flush_link_candidate_set(struct cand *cv, size_t n,
95 const char *str)
96{
97 struct cand *p, *q, *pp, *qq;
98 const struct cand *cvl = cv + n;
99 dstr da = DSTR_INIT, db = DSTR_INIT;
100 int fda = -1, fdb = -1;
101 ssize_t na, nb;
102 char bufa[BUFSZ], bufb[BUFSZ];
103
104 /* If there are no candidates, then back we go. (This happens when we
105 * read the first line.)
106 */
107 if (!n) return;
108
109 /* Here, we trundle through the candidates and link them into sets sharing
110 * the same inode number. Candidates within a chain are linked by their
111 * `next' pointers, and each member (including the representative itself)
112 * has an `rep' pointer to a set representative, which is the lowest-indexed
113 * candidate in the set. The set representatives are linked into a
114 * doubly-linked list by `nextset' and `prevset'.
115 *
116 * There are no separate list heads. Instead, we just know that the first
117 * element is must be a set representative. Handle this element specially
118 * as a result to initialize the chains.
119 */
120 cv->rep = cv; cv->next = 0; cv->nextset = cv->prevset = 0;
121 for (p = cv + 1; p < cvl; p++) {
122 for (q = cv; q < p; q++) {
123 if (p->st.st_ino == q->st.st_ino) {
124 p->rep = q;
125 p->next = q->next;
126 q->next = p;
127 goto linked;
128 }
129 }
130 p->rep = p; p->next = 0;
131 p->nextset = cv->nextset; p->prevset = cv;
132 if (cv->nextset) cv->nextset->prevset = p;
133 cv->nextset = p;
134 linked:;
135 }
136#ifdef DEBUG
137 dump_link_map(cv, cvl);
138#endif
139
140 /* Now see whether we can merge link sets. We can do this if and only if
141 * they have no volumes in common.
142 */
143 for (p = cv->nextset; p; p = p->nextset) {
144 for (q = cv; q != p; q = q->nextset) {
145 for (pp = p; pp; pp = pp->next) {
146 for (qq = q; qq; qq = qq->next)
147 if (strcmp(str + pp->vol, str + qq->vol) == 0) goto nextset;
148 }
149
150 /* We have a potential match. Let's start to get things going. */
151 DRESET(&da); dstr_putf(&da, "%s/%s", str + p->vol, str + p->name);
152 DRESET(&db); dstr_putf(&db, "%s/%s", str + q->vol, str + q->name);
153
154 /* Make sure that our data structure is in a good state. If this is
155 * wrong then we might have missed a common volume, for example.
156 */
157 picky_link_check(cv, cvl);
158
159 /* Investigate more carefully to make sure this is actually going to
160 * work. Maybe we didn't record the file status properly.
161 */
162 if (p->st.st_mode != q->st.st_mode ||
163 p->st.st_mtime != q->st.st_mtime ||
164 p->st.st_size != q->st.st_size ||
165 p->st.st_uid != q->st.st_uid ||
166 p->st.st_gid != q->st.st_gid) {
167 moan("rejecting potential link `%s' <-> `%s': stat mismatch",
168 da.buf, db.buf);
169 goto nextset;
170 }
171
172 /* Check the actual file contents. (This could take a while, but we
173 * really don't want to screw up.)
174 */
175 if ((fda = open(da.buf, O_RDONLY)) < 0 ||
176 (fdb = open(db.buf, O_RDONLY)) < 0) {
177 moan("open failed for potential link `%s' <-> `%s': %s",
178 da.buf, db.buf, strerror(errno));
179 goto nextset_close;
180 }
181 for (;;) {
182 na = read(fda, bufa, sizeof(bufa));
183 nb = read(fdb, bufb, sizeof(bufb));
184 if (na < 0 || nb < 0) {
185 moan("read failed for potential link `%s' <-> `%s': %s",
186 da.buf, db.buf, strerror(errno));
187 goto nextset_close;
188 }
189 if (na != nb) {
190 moan("rejecting potential link `%s' <-> `%s': length mismatch",
191 da.buf, db.buf);
192 goto nextset_close;
193 }
194 if (!na) break;
195 if (memcmp(bufa, bufb, na) != 0) {
196 moan("rejecting potential link `%s' <-> `%s': content mismatch",
197 da.buf, db.buf);
198 goto nextset_close;
199 }
200 }
201
202 /* Actually do the work. Well, actually, not yet; just report on it.
203 */
204 printf("save %lu\n", p->st.st_blocks*512);
205 printf(" -> %s\n", db.buf);
206 for (pp = p; pp; pp = pp->next)
207 printf(" <- %s/%s\n", str + pp->vol, str + pp->name);
208 printf("\n");
209
210 /* Now actually merge the structures together. */
211 for (pp = p;; pp = pp->next) {
212 pp->st = q->st;
213 pp->rep = q;
214 if (!pp->next) { pp->next = q->next; break; }
215 }
216 q->next = p;
217 if (p->nextset) p->nextset->prevset = p->prevset;
218 if (p->prevset) p->prevset->nextset = p->nextset;
219
220#ifdef DEBUG
221 dump_link_map(cv, cvl);
222#endif
223
224 nextset_close:
225 if (fda >= 0) { close(fda); fda = -1; }
226 if (fdb >= 0) { close(fdb); fdb = -1; }
227 break;
228
229 nextset:;
230 }
231 }
232
233 /* All done. */
234 dstr_destroy(&da);
235 dstr_destroy(&db);
236}
237
238int main(int argc, char *argv[])
239{
240 dstr dx = DSTR_INIT, dy = DSTR_INIT, *d = &dx, *dd = &dy, *dt;
241 dstr dc = DSTR_INIT;
242 cand_v cv = DA_INIT;
243 char *n, *t, *k;
244 size_t ksz, oksz = 0;
245 int i, x;
246 size_t len;
247
248 /* Explain who we are. */
249 ego(argv[0]);
250
251 /* Work through each input line in turn. */
252 for (;;) {
253
254 /* Read the next line. */
255 DRESET(d);
256 if (dstr_putline(d, stdin) == EOF) break;
257
258 /* Split the line into key, tag, and filename fields. */
259 k = d->buf;
260 t = strchr(k, '|'); if (!t) goto parsefail; ksz = t - k; *t++ = 0;
261 n = strchr(t, '|'); if (!n) goto parsefail; *n++ = 0;
262
263 /* If this key is not the same as the previous one then flush the current
264 * candidate set and swap buffers (to keep the current key available).
265 */
266 if (ksz != oksz || memcmp(k, dd->buf, ksz) != 0) {
267 flush_link_candidate_set(DA(&cv), DA_LEN(&cv), dc.buf);
268 dt = d; d = dd; dd = dt; oksz = ksz;
269 DA_RESET(&cv);
270 DRESET(&dc);
271 }
272
273 /* Commit the volume tag and name to the string buffer, and record the
274 * offsets in a new candidate record.
275 *
276 * This involves decoding the filename, which is a little annoying.
277 */
278 DA_ENSURE(&cv, 1); i = DA_LEN(&cv);
279 DA(&cv)[i].vol = dc.len; DPUTS(&dc, t); DPUTC(&dc, '/');
280 DA(&cv)[i].name = dc.len;
281 while (*n) {
282
283 /* Not a backslash. Gather up a chunk of not-backslash stuff and
284 * commit it in one go.
285 */
286 if (*n != '\\') {
287 len = strcspn(n, "\\");
288 DPUTM(&dc, n, len);
289 n += len;
290 continue;
291 }
292
293 /* Decode an escaped thing. Only handle the things which aren't
294 * actually literals.
295 */
296 n++;
297 switch (*n) {
298 case 0: goto parsefail;
299 case 'n': DPUTC(&dc, '\n'); n++; break;
300 case 'r': DPUTC(&dc, '\r'); n++; break;
301 case 't': DPUTC(&dc, '\t'); n++; break;
302 case 'x':
303 n++;
304 if ('0' <= *n && *n <= '9') x = *n - '0';
305 else if ('A' <= *n && *n <= 'F') x = *n + 10 - 'A';
306 else if ('a' <= *n && *n <= 'f') x = *n + 10 - 'a';
307 else goto parsefail;
308 n++; x <<= 4;
309 if ('0' <= *n && *n <= '9') x |= *n - '0';
310 else if ('A' <= *n && *n <= 'F') x |= *n + 10 - 'A';
311 else if ('a' <= *n && *n <= 'f') x |= *n + 10 - 'a';
312 else goto parsefail;
313 n++;
314 DPUTC(&dc, x);
315 break;
316 }
317 }
318 DPUTC(&dc, 0);
319
320 /* Fetch the inode number for the file, and then replace the `/' with a
321 * zero.
322 */
323 n = dc.buf + DA(&cv)[i].vol;
324 if (stat(n, &DA(&cv)[i].st)) die(1, "stat(%s): %s", n, strerror(errno));
325 dc.buf[DA(&cv)[i].name - 1] = 0;
326
327 /* Commit the candidate record and continue. */
328 DA_EXTEND(&cv, 1);
329 }
330
331 /* Flush any remaining candidate set. */
332 flush_link_candidate_set(DA(&cv), DA_LEN(&cv), dc.buf);
333 return (0);
334
335parsefail:
336 die(1, "parse failure");
337}