Commit | Line | Data |
---|---|---|
f6b4ffdc MW |
1 | #! @PYTHON@ |
2 | ### | |
3 | ### Efficiently construct canonical digests of filesystems | |
4 | ### | |
5 | ### (c) 2012 Mark Wooding | |
6 | ### | |
7 | ||
8 | ###----- Licensing notice --------------------------------------------------- | |
9 | ### | |
10 | ### This file is part of the `rsync-backup' program. | |
11 | ### | |
12 | ### rsync-backup is free software; you can redistribute it and/or modify | |
13 | ### it under the terms of the GNU General Public License as published by | |
14 | ### the Free Software Foundation; either version 2 of the License, or | |
15 | ### (at your option) any later version. | |
16 | ### | |
17 | ### rsync-backup is distributed in the hope that it will be useful, | |
18 | ### but WITHOUT ANY WARRANTY; without even the implied warranty of | |
19 | ### MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
20 | ### GNU General Public License for more details. | |
21 | ### | |
22 | ### You should have received a copy of the GNU General Public License | |
23 | ### along with rsync-backup; if not, write to the Free Software Foundation, | |
24 | ### Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. | |
25 | ||
26 | from sys import argv, exit, stdin, stdout, stderr | |
27 | import os as OS | |
28 | import re as RX | |
29 | import time as T | |
30 | import stat as ST | |
31 | import optparse as OP | |
32 | import hashlib as H | |
33 | import sqlite3 as DB | |
34 | import zlib as Z | |
35 | ||
36 | PACKAGE = '@PACKAGE@' | |
37 | VERSION = '@VERSION@' | |
38 | ||
39 | ###-------------------------------------------------------------------------- | |
40 | ### Utilities. | |
41 | ||
42 | QUIS = OS.path.basename(argv[0]) | |
43 | ||
44 | def moan(msg): | |
45 | stderr.write('%s: %s\n' % (QUIS, msg)) | |
46 | ||
47 | def die(msg, rc = 1): | |
48 | moan(msg) | |
49 | exit(rc) | |
50 | ||
51 | SYSERR = 0 | |
52 | def syserr(msg): | |
53 | global SYSERR | |
54 | moan(msg) | |
55 | SYSERR += 1 | |
56 | ||
57 | ###-------------------------------------------------------------------------- | |
58 | ### File system enumeration. | |
59 | ||
60 | class FileInfo (object): | |
61 | def __init__(me, file, st = None): | |
62 | me.name = file | |
63 | if st: | |
64 | me.st = st | |
65 | me.err = None | |
66 | else: | |
67 | try: | |
68 | me.st = OS.lstat(file) | |
69 | me.err = None | |
70 | except OSError, err: | |
71 | me.st = None | |
72 | me.err = err | |
73 | ||
74 | def enum_walk(file, func): | |
75 | ||
76 | def dirents(name): | |
77 | try: | |
78 | return OS.listdir(name) | |
79 | except OSError, err: | |
80 | syserr("failed to read directory `%s': %s" % (name, err.strerror)) | |
81 | return [] | |
82 | ||
83 | def dir(ee, dev): | |
84 | ff = [] | |
85 | dd = [] | |
86 | for e in ee: | |
87 | fi = FileInfo(e) | |
88 | if fi.st and fi.st.st_dev != dev: pass | |
89 | if fi.st and ST.S_ISDIR(fi.st.st_mode): dd.append(fi) | |
90 | else: ff.append(fi) | |
91 | ff.sort(key = lambda fi: fi.name) | |
92 | dd.sort(key = lambda fi: fi.name + '/') | |
93 | for f in ff: | |
94 | func(f) | |
95 | for d in dd: | |
96 | if d.st.st_dev == dev: | |
97 | func(d) | |
98 | dir([OS.path.join(d.name, e) for e in dirents(d.name)], dev) | |
99 | ||
100 | if file.endswith('/'): | |
ad7e8534 MW |
101 | cwd = OS.open('.', OS.O_RDONLY) |
102 | try: | |
103 | OS.chdir(file) | |
104 | fi = FileInfo('.') | |
105 | func(fi) | |
106 | dir(dirents('.'), fi.st.st_dev) | |
107 | finally: | |
108 | OS.fchdir(cwd) | |
109 | OS.close(cwd) | |
f6b4ffdc MW |
110 | else: |
111 | fi = FileInfo(file) | |
112 | func(fi) | |
113 | if fi.st and ST.S_ISDIR(fi.st.st_mode): | |
114 | dir([OS.path.join(fi.name, e) for e in dirents(fi.name)], | |
115 | fi.st.st_dev) | |
116 | ||
117 | def enum_find0(f, func): | |
118 | tail = "" | |
119 | while True: | |
120 | buf = f.read(8192) | |
121 | last = len(buf) == 0 | |
122 | names = (tail + buf).split('\0') | |
123 | tail = names.pop() | |
124 | for n in names: | |
125 | func(FileInfo(n)) | |
126 | if last: | |
127 | break | |
128 | if len(tail): | |
129 | moan("ignored trailing junk after last filename") | |
130 | ||
131 | RX_RSYNCESC = RX.compile(r'\\ \# ([0-7]{3})', RX.VERBOSE) | |
132 | def enum_rsync(f, func): | |
133 | ||
134 | ## The format is a little fiddly. Each line consists of PERMS SIZE DATE | |
135 | ## TIME NAME, separated by runs of whitespace, but the NAME starts exactly | |
136 | ## one space character after the TIME and may begin with a space. | |
137 | ## Sequences of the form `\#OOO' where OOO are three octal digits, stand | |
138 | ## for a byte with that value. Newlines and backslashes which would be | |
139 | ## ambiguous are converted into this form; all other characters are | |
140 | ## literal. | |
141 | ## | |
142 | ## We ignore the stat information and retrieve it ourselves, because it's | |
143 | ## incomplete. Hopefully the dcache is still warm. | |
144 | ||
145 | for line in f: | |
146 | if line.endswith('\n'): line = line[:-1] | |
147 | ||
148 | ## Extract the escaped name. | |
149 | ff = line.split(None, 3) | |
150 | if len(ff) != 4: | |
151 | syserr("ignoring invalid line from rsync: `%s'" % line) | |
152 | continue | |
153 | tail = ff[3] | |
154 | try: | |
155 | spc = tail.index(' ') | |
156 | except ValueError: | |
157 | syserr("ignoring invalid line from rsync: `%s'" % line) | |
158 | continue | |
159 | name = tail[spc + 1:] | |
160 | ||
161 | ## Now translate escape sequences. | |
162 | name = RX_RSYNCESC.sub(lambda m: chr(int(m.group(1), 8)), name) | |
163 | ||
164 | ## Call the client. | |
165 | try: | |
166 | fi = FileInfo(name) | |
167 | except OSError, err: | |
168 | syserr("failed to stat `%s': %s" % (name, err.strerror)) | |
169 | continue | |
170 | func(fi) | |
171 | ||
172 | ###-------------------------------------------------------------------------- | |
173 | ### The hash cache. | |
174 | ||
175 | class HashCache (object): | |
176 | ||
177 | VERSION = 0 | |
178 | BUFSZ = 128*1024 | |
179 | ||
180 | INIT = [ | |
181 | """CREATE TABLE meta ( | |
182 | version INTEGER NOT NULL, | |
183 | hash TEXT NOT NULL | |
184 | );""", | |
185 | """CREATE TABLE hash ( | |
186 | ino INTEGER PRIMARY KEY, | |
187 | mtime INTEGER NOT NULL, | |
188 | ctime INTEGER NOT NULL, | |
189 | size INTEGER NOT NULL, | |
190 | hash TEXT NOT NULL, | |
191 | seen BOOLEAN NOT NULL DEFAULT TRUE | |
192 | );""", | |
193 | """PRAGMA journal_mode = WAL;""" | |
194 | ] | |
195 | ||
196 | def __init__(me, file, hash = None): | |
197 | ||
198 | if file is None: | |
199 | ||
200 | ## We're going this alone, with no cache. | |
201 | db = None | |
202 | if hash is None: | |
203 | die("no hash specified and no database cache to read from") | |
204 | else: | |
205 | ||
206 | ## Connect to the database. | |
207 | db = DB.connect(file) | |
208 | db.text_factory = str | |
209 | ||
210 | ## See whether we can understand the cache database. | |
211 | c = db.cursor() | |
212 | v = h = None | |
213 | try: | |
214 | c.execute('SELECT version, hash FROM meta') | |
215 | v, h = c.fetchone() | |
216 | if c.fetchone() is not None: | |
217 | die("cache database corrupt: meta table has mutliple rows") | |
218 | except (DB.Error, TypeError): | |
219 | pass | |
220 | ||
221 | ## If that didn't work, we'd better clear the thing and start again. | |
222 | ## But only if we know how to initialize it. | |
223 | if v != me.VERSION: | |
224 | ||
225 | ## Explain the situation. | |
226 | moan("cache version %s not understood" % v) | |
227 | if hash is None: | |
228 | if h is None: | |
229 | die("can't initialize cache: no hash function set") | |
230 | else: | |
231 | hash = h | |
232 | try: | |
233 | H.new(hash) | |
234 | except Exception: | |
235 | die("unknown hash function `%s'" % hash) | |
236 | ||
237 | ## Drop old things. | |
238 | c.execute('SELECT type, name FROM sqlite_master') | |
239 | for type, name in c.fetchall(): | |
240 | c.execute('DROP %s IF EXISTS %s' % (type, name)) | |
241 | ||
242 | ## Now we're ready to go. | |
243 | for stmt in me.INIT: | |
244 | c.execute(stmt) | |
245 | c.execute('INSERT INTO meta VALUES (?, ?)', [me.VERSION, hash]) | |
246 | db.commit() | |
247 | ||
248 | ## Check the hash function if necessary. | |
249 | if hash is None: | |
250 | hash = h | |
251 | elif h is not None and h != hash: | |
252 | die("hash mismatch: cache uses %s but %s requested" % (h, hash)) | |
253 | ||
254 | ## All done. | |
255 | me.hash = hash | |
256 | me._db = db | |
257 | me._pend = 0 | |
258 | ||
259 | def hashfile(me, fi): | |
260 | ||
261 | ## If this isn't a proper file then don't try to hash it. | |
262 | if fi.err or not ST.S_ISREG(fi.st.st_mode): | |
263 | return None | |
264 | ||
265 | ## See whether there's a valid entry in the cache. | |
266 | if me._db: | |
267 | c = me._db.cursor() | |
268 | c.execute( | |
269 | 'SELECT mtime, size, hash, seen FROM hash WHERE ino = ?;', | |
270 | [fi.st.st_ino]) | |
271 | r = c.fetchone() | |
272 | if r is not None: | |
273 | mt, sz, h, s = r | |
274 | if mt == fi.st.st_mtime and \ | |
275 | sz == fi.st.st_size: | |
276 | if not s: | |
277 | c.execute('UPDATE hash SET seen = 1 WHERE ino = ?', | |
278 | [fi.st.st_ino]) | |
279 | me._update() | |
280 | return h | |
281 | ||
282 | ## Hash the file. Beware raciness: update the file information from the | |
283 | ## open descriptor, but set the size from what we actually read. | |
284 | h = H.new(me.hash) | |
285 | try: | |
286 | with open(fi.name, 'rb') as f: | |
287 | sz = 0 | |
288 | while True: | |
289 | buf = f.read(me.BUFSZ) | |
290 | if len(buf) == 0: | |
291 | break | |
292 | sz += len(buf) | |
293 | h.update(buf) | |
294 | fi.st = OS.fstat(f.fileno()) | |
295 | ##fi.st.st_size = sz | |
296 | hash = h.digest() | |
297 | except (OSError, IOError), err: | |
298 | fi.st = None | |
299 | fi.err = err | |
300 | return None | |
301 | hash = hash.encode('hex') | |
302 | ||
303 | ## Insert a record into the database. | |
304 | if me._db: | |
305 | c.execute(""" | |
306 | INSERT OR REPLACE INTO hash | |
307 | (ino, mtime, ctime, size, hash, seen) | |
308 | VALUES | |
309 | (?, ?, ?, ?, ?, 1); | |
310 | """, [fi.st.st_ino, | |
311 | fi.st.st_mtime, | |
312 | fi.st.st_ctime, | |
313 | fi.st.st_size, | |
314 | hash]) | |
315 | me._update() | |
316 | ||
317 | ## Done. | |
318 | return hash | |
319 | ||
320 | def _update(me): | |
321 | me._pend += 1 | |
322 | if me._pend >= 1024: | |
323 | me.flush() | |
324 | ||
325 | def flush(me): | |
326 | if me._db: | |
327 | me._db.commit() | |
328 | me._pend = 0 | |
329 | ||
330 | def need_db(me): | |
331 | if not me._db: | |
332 | die("no cache database") | |
333 | ||
334 | def reset(me): | |
335 | me.need_db() | |
336 | c = me._db.cursor() | |
337 | c.execute('UPDATE hash SET seen = 0 WHERE seen') | |
338 | me.flush() | |
339 | ||
340 | def prune(me): | |
341 | me.need_db() | |
342 | c = me._db.cursor() | |
343 | c.execute('DELETE FROM hash WHERE NOT seen') | |
344 | me.flush() | |
345 | ||
346 | ###-------------------------------------------------------------------------- | |
347 | ### Printing output. | |
348 | ||
349 | class GenericFormatter (object): | |
350 | def __init__(me, fi): | |
351 | me.fi = fi | |
352 | def _fmt_time(me, t): | |
353 | tm = T.gmtime(t) | |
354 | return T.strftime('%Y-%m-%dT%H:%M:%SZ', tm) | |
355 | def _enc_name(me, n): | |
8aeb0c53 | 356 | return ' \\-> '.join(n.encode('string_escape').split(' -> ')) |
f6b4ffdc MW |
357 | def name(me): |
358 | return me._enc_name(me.fi.name) | |
359 | def info(me): | |
360 | return me.TYPE | |
361 | def mode(me): | |
362 | return '%06o' % me.fi.st.st_mode | |
363 | def size(me): | |
364 | return me.fi.st.st_size | |
365 | def mtime(me): | |
366 | return me._fmt_time(me.fi.st.st_mtime) | |
367 | def owner(me): | |
368 | return '%5d:%d' % (me.fi.st.st_uid, me.fi.st.st_gid) | |
369 | ||
370 | class ErrorFormatter (GenericFormatter): | |
371 | def info(me): | |
372 | return 'E%d %s' % (me.fi.err.errno, me.fi.err.strerror) | |
373 | def error(me): return 'error' | |
374 | mode = size = mtime = owner = error | |
375 | ||
376 | class SocketFormatter (GenericFormatter): | |
377 | TYPE = 'socket' | |
378 | class PipeFormatter (GenericFormatter): | |
379 | TYPE = 'fifo' | |
380 | ||
381 | class LinkFormatter (GenericFormatter): | |
382 | TYPE = 'symbolic-link' | |
383 | def name(me): | |
384 | n = GenericFormatter.name(me) | |
385 | try: | |
386 | d = OS.readlink(me.fi.name) | |
387 | return '%s -> %s' % (n, me._enc_name(d)) | |
388 | except OSError, err: | |
389 | return '%s -> <E%d %s>' % (n, err.errno, err.strerror) | |
390 | ||
391 | class DirectoryFormatter (GenericFormatter): | |
392 | TYPE = 'directory' | |
393 | def name(me): return GenericFormatter.name(me) + '/' | |
394 | def size(me): return 'dir' | |
395 | ||
396 | class DeviceFormatter (GenericFormatter): | |
397 | def info(me): | |
398 | return '%s %d:%d' % (me.TYPE, | |
399 | OS.major(me.fi.st.st_rdev), | |
400 | OS.minor(me.fi.st.st_rdev)) | |
401 | class BlockDeviceFormatter (DeviceFormatter): | |
402 | TYPE = 'block-device' | |
403 | class CharDeviceFormatter (DeviceFormatter): | |
404 | TYPE = 'character-device' | |
405 | ||
406 | class FileFormatter (GenericFormatter): | |
407 | TYPE = 'regular-file' | |
408 | ||
409 | class Reporter (object): | |
410 | ||
411 | TYMAP = { | |
412 | ST.S_IFSOCK: SocketFormatter, | |
413 | ST.S_IFDIR: DirectoryFormatter, | |
414 | ST.S_IFLNK: LinkFormatter, | |
415 | ST.S_IFREG: FileFormatter, | |
416 | ST.S_IFBLK: BlockDeviceFormatter, | |
417 | ST.S_IFCHR: CharDeviceFormatter, | |
418 | ST.S_IFIFO: PipeFormatter, | |
419 | } | |
420 | ||
421 | def __init__(me, db): | |
422 | me._inomap = {} | |
423 | me._vinomap = {} | |
424 | me._db = db | |
425 | me._hsz = int(H.new(db.hash).digest_size) | |
426 | ||
427 | def file(me, fi): | |
428 | h = me._db.hashfile(fi) | |
429 | if fi.err: | |
430 | fmt = ErrorFormatter(fi) | |
431 | vino = 'error' | |
432 | else: | |
433 | fmt = me.TYMAP[ST.S_IFMT(fi.st.st_mode)](fi) | |
434 | inoidx = fi.st.st_dev, fi.st.st_ino | |
435 | try: | |
436 | vino = me._inomap[inoidx] | |
437 | except KeyError: | |
438 | suffix = '' | |
439 | seq = 0 | |
440 | while True: | |
441 | vino = '%08x' % (Z.crc32(fi.name + suffix) & 0xffffffff) | |
442 | if vino not in me._vinomap: break | |
443 | suffix = '\0%d' % seq | |
444 | seq += 1 | |
445 | me._inomap[inoidx] = vino | |
446 | if h: info = h | |
447 | else: info = '[%-*s]' % (2*me._hsz - 2, fmt.info()) | |
448 | print '%s %8s %6s %-12s %-20s %20s %s' % ( | |
449 | info, vino, fmt.mode(), fmt.owner(), | |
450 | fmt.mtime(), fmt.size(), fmt.name()) | |
451 | ||
452 | ###-------------------------------------------------------------------------- | |
453 | ### Main program. | |
454 | ||
455 | FMTMAP = { | |
456 | 'rsync': lambda f: enum_rsync(stdin, f), | |
457 | 'find0': lambda f: enum_find0(stdin, f) | |
458 | } | |
459 | op = OP.OptionParser( | |
460 | usage = '%prog [-a] [-c CACHE] [-f FORMAT] [-H HASH] [FILE ...]', | |
461 | version = '%%prog, version %s' % VERSION, | |
462 | description = '''\ | |
463 | Print a digest of a filesystem (or a collection of specified files) to | |
464 | standard output. The idea is that the digest should be mostly /complete/ | |
465 | (i.e., any `interesting\' change to the filesystem results in a different | |
466 | digest) and /canonical/ (i.e., identical filesystem contents result in | |
467 | identical output). | |
468 | ''') | |
469 | ||
470 | for short, long, props in [ | |
471 | ('-a', '--all', { 'action': 'store_true', 'dest': 'all', | |
472 | 'help': 'clear cache of all files not seen' }), | |
473 | ('-c', '--cache', { 'dest': 'cache', 'metavar': 'FILE', | |
474 | 'help': 'use FILE as a cache for file hashes' }), | |
475 | ('-f', '--files', { 'dest': 'files', 'metavar': 'FORMAT', | |
476 | 'type': 'choice', 'choices': FMTMAP.keys(), | |
477 | 'help': 'read files to report in the given FORMAT' }), | |
478 | ('-H', '--hash', { 'dest': 'hash', 'metavar': 'HASH', | |
479 | ##'type': 'choice', 'choices': H.algorithms, | |
480 | 'help': 'use HASH as the hash function' })]: | |
481 | op.add_option(short, long, **props) | |
482 | opts, args = op.parse_args(argv) | |
483 | ||
484 | if not opts.files and len(args) <= 1: | |
485 | die("no filename sources: nothing to do") | |
486 | db = HashCache(opts.cache, opts.hash) | |
487 | if opts.all: | |
488 | db.reset() | |
489 | rep = Reporter(db) | |
490 | if opts.files: | |
491 | FMTMAP[opts.files](rep.file) | |
492 | for dir in args[1:]: | |
493 | enum_walk(dir, rep.file) | |
494 | if opts.all: | |
495 | db.prune() | |
496 | db.flush() | |
497 | ||
498 | ###----- That's all, folks -------------------------------------------------- |