Commit | Line | Data |
---|---|---|
f6b4ffdc MW |
1 | #! @PYTHON@ |
2 | ### | |
3 | ### Efficiently construct canonical digests of filesystems | |
4 | ### | |
5 | ### (c) 2012 Mark Wooding | |
6 | ### | |
7 | ||
8 | ###----- Licensing notice --------------------------------------------------- | |
9 | ### | |
10 | ### This file is part of the `rsync-backup' program. | |
11 | ### | |
12 | ### rsync-backup is free software; you can redistribute it and/or modify | |
13 | ### it under the terms of the GNU General Public License as published by | |
14 | ### the Free Software Foundation; either version 2 of the License, or | |
15 | ### (at your option) any later version. | |
16 | ### | |
17 | ### rsync-backup is distributed in the hope that it will be useful, | |
18 | ### but WITHOUT ANY WARRANTY; without even the implied warranty of | |
19 | ### MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
20 | ### GNU General Public License for more details. | |
21 | ### | |
22 | ### You should have received a copy of the GNU General Public License | |
23 | ### along with rsync-backup; if not, write to the Free Software Foundation, | |
24 | ### Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. | |
25 | ||
26 | from sys import argv, exit, stdin, stdout, stderr | |
27 | import os as OS | |
28 | import re as RX | |
29 | import time as T | |
30 | import stat as ST | |
31 | import optparse as OP | |
32 | import hashlib as H | |
33 | import sqlite3 as DB | |
34 | import zlib as Z | |
35 | ||
36 | PACKAGE = '@PACKAGE@' | |
37 | VERSION = '@VERSION@' | |
38 | ||
39 | ###-------------------------------------------------------------------------- | |
40 | ### Utilities. | |
41 | ||
42 | QUIS = OS.path.basename(argv[0]) | |
43 | ||
44 | def moan(msg): | |
45 | stderr.write('%s: %s\n' % (QUIS, msg)) | |
46 | ||
47 | def die(msg, rc = 1): | |
48 | moan(msg) | |
49 | exit(rc) | |
50 | ||
51 | SYSERR = 0 | |
52 | def syserr(msg): | |
53 | global SYSERR | |
54 | moan(msg) | |
55 | SYSERR += 1 | |
56 | ||
57 | ###-------------------------------------------------------------------------- | |
58 | ### File system enumeration. | |
59 | ||
60 | class FileInfo (object): | |
61 | def __init__(me, file, st = None): | |
62 | me.name = file | |
63 | if st: | |
64 | me.st = st | |
65 | me.err = None | |
66 | else: | |
67 | try: | |
68 | me.st = OS.lstat(file) | |
69 | me.err = None | |
70 | except OSError, err: | |
71 | me.st = None | |
72 | me.err = err | |
73 | ||
74 | def enum_walk(file, func): | |
75 | ||
76 | def dirents(name): | |
77 | try: | |
78 | return OS.listdir(name) | |
79 | except OSError, err: | |
80 | syserr("failed to read directory `%s': %s" % (name, err.strerror)) | |
81 | return [] | |
82 | ||
83 | def dir(ee, dev): | |
84 | ff = [] | |
85 | dd = [] | |
86 | for e in ee: | |
87 | fi = FileInfo(e) | |
88 | if fi.st and fi.st.st_dev != dev: pass | |
89 | if fi.st and ST.S_ISDIR(fi.st.st_mode): dd.append(fi) | |
90 | else: ff.append(fi) | |
91 | ff.sort(key = lambda fi: fi.name) | |
92 | dd.sort(key = lambda fi: fi.name + '/') | |
93 | for f in ff: | |
94 | func(f) | |
95 | for d in dd: | |
96 | if d.st.st_dev == dev: | |
97 | func(d) | |
98 | dir([OS.path.join(d.name, e) for e in dirents(d.name)], dev) | |
99 | ||
100 | if file.endswith('/'): | |
101 | OS.chdir(file) | |
102 | fi = FileInfo('.') | |
103 | func(fi) | |
104 | dir(dirents('.'), fi.st.st_dev) | |
105 | else: | |
106 | fi = FileInfo(file) | |
107 | func(fi) | |
108 | if fi.st and ST.S_ISDIR(fi.st.st_mode): | |
109 | dir([OS.path.join(fi.name, e) for e in dirents(fi.name)], | |
110 | fi.st.st_dev) | |
111 | ||
112 | def enum_find0(f, func): | |
113 | tail = "" | |
114 | while True: | |
115 | buf = f.read(8192) | |
116 | last = len(buf) == 0 | |
117 | names = (tail + buf).split('\0') | |
118 | tail = names.pop() | |
119 | for n in names: | |
120 | func(FileInfo(n)) | |
121 | if last: | |
122 | break | |
123 | if len(tail): | |
124 | moan("ignored trailing junk after last filename") | |
125 | ||
126 | RX_RSYNCESC = RX.compile(r'\\ \# ([0-7]{3})', RX.VERBOSE) | |
127 | def enum_rsync(f, func): | |
128 | ||
129 | ## The format is a little fiddly. Each line consists of PERMS SIZE DATE | |
130 | ## TIME NAME, separated by runs of whitespace, but the NAME starts exactly | |
131 | ## one space character after the TIME and may begin with a space. | |
132 | ## Sequences of the form `\#OOO' where OOO are three octal digits, stand | |
133 | ## for a byte with that value. Newlines and backslashes which would be | |
134 | ## ambiguous are converted into this form; all other characters are | |
135 | ## literal. | |
136 | ## | |
137 | ## We ignore the stat information and retrieve it ourselves, because it's | |
138 | ## incomplete. Hopefully the dcache is still warm. | |
139 | ||
140 | for line in f: | |
141 | if line.endswith('\n'): line = line[:-1] | |
142 | ||
143 | ## Extract the escaped name. | |
144 | ff = line.split(None, 3) | |
145 | if len(ff) != 4: | |
146 | syserr("ignoring invalid line from rsync: `%s'" % line) | |
147 | continue | |
148 | tail = ff[3] | |
149 | try: | |
150 | spc = tail.index(' ') | |
151 | except ValueError: | |
152 | syserr("ignoring invalid line from rsync: `%s'" % line) | |
153 | continue | |
154 | name = tail[spc + 1:] | |
155 | ||
156 | ## Now translate escape sequences. | |
157 | name = RX_RSYNCESC.sub(lambda m: chr(int(m.group(1), 8)), name) | |
158 | ||
159 | ## Call the client. | |
160 | try: | |
161 | fi = FileInfo(name) | |
162 | except OSError, err: | |
163 | syserr("failed to stat `%s': %s" % (name, err.strerror)) | |
164 | continue | |
165 | func(fi) | |
166 | ||
167 | ###-------------------------------------------------------------------------- | |
168 | ### The hash cache. | |
169 | ||
170 | class HashCache (object): | |
171 | ||
172 | VERSION = 0 | |
173 | BUFSZ = 128*1024 | |
174 | ||
175 | INIT = [ | |
176 | """CREATE TABLE meta ( | |
177 | version INTEGER NOT NULL, | |
178 | hash TEXT NOT NULL | |
179 | );""", | |
180 | """CREATE TABLE hash ( | |
181 | ino INTEGER PRIMARY KEY, | |
182 | mtime INTEGER NOT NULL, | |
183 | ctime INTEGER NOT NULL, | |
184 | size INTEGER NOT NULL, | |
185 | hash TEXT NOT NULL, | |
186 | seen BOOLEAN NOT NULL DEFAULT TRUE | |
187 | );""", | |
188 | """PRAGMA journal_mode = WAL;""" | |
189 | ] | |
190 | ||
191 | def __init__(me, file, hash = None): | |
192 | ||
193 | if file is None: | |
194 | ||
195 | ## We're going this alone, with no cache. | |
196 | db = None | |
197 | if hash is None: | |
198 | die("no hash specified and no database cache to read from") | |
199 | else: | |
200 | ||
201 | ## Connect to the database. | |
202 | db = DB.connect(file) | |
203 | db.text_factory = str | |
204 | ||
205 | ## See whether we can understand the cache database. | |
206 | c = db.cursor() | |
207 | v = h = None | |
208 | try: | |
209 | c.execute('SELECT version, hash FROM meta') | |
210 | v, h = c.fetchone() | |
211 | if c.fetchone() is not None: | |
212 | die("cache database corrupt: meta table has mutliple rows") | |
213 | except (DB.Error, TypeError): | |
214 | pass | |
215 | ||
216 | ## If that didn't work, we'd better clear the thing and start again. | |
217 | ## But only if we know how to initialize it. | |
218 | if v != me.VERSION: | |
219 | ||
220 | ## Explain the situation. | |
221 | moan("cache version %s not understood" % v) | |
222 | if hash is None: | |
223 | if h is None: | |
224 | die("can't initialize cache: no hash function set") | |
225 | else: | |
226 | hash = h | |
227 | try: | |
228 | H.new(hash) | |
229 | except Exception: | |
230 | die("unknown hash function `%s'" % hash) | |
231 | ||
232 | ## Drop old things. | |
233 | c.execute('SELECT type, name FROM sqlite_master') | |
234 | for type, name in c.fetchall(): | |
235 | c.execute('DROP %s IF EXISTS %s' % (type, name)) | |
236 | ||
237 | ## Now we're ready to go. | |
238 | for stmt in me.INIT: | |
239 | c.execute(stmt) | |
240 | c.execute('INSERT INTO meta VALUES (?, ?)', [me.VERSION, hash]) | |
241 | db.commit() | |
242 | ||
243 | ## Check the hash function if necessary. | |
244 | if hash is None: | |
245 | hash = h | |
246 | elif h is not None and h != hash: | |
247 | die("hash mismatch: cache uses %s but %s requested" % (h, hash)) | |
248 | ||
249 | ## All done. | |
250 | me.hash = hash | |
251 | me._db = db | |
252 | me._pend = 0 | |
253 | ||
254 | def hashfile(me, fi): | |
255 | ||
256 | ## If this isn't a proper file then don't try to hash it. | |
257 | if fi.err or not ST.S_ISREG(fi.st.st_mode): | |
258 | return None | |
259 | ||
260 | ## See whether there's a valid entry in the cache. | |
261 | if me._db: | |
262 | c = me._db.cursor() | |
263 | c.execute( | |
264 | 'SELECT mtime, size, hash, seen FROM hash WHERE ino = ?;', | |
265 | [fi.st.st_ino]) | |
266 | r = c.fetchone() | |
267 | if r is not None: | |
268 | mt, sz, h, s = r | |
269 | if mt == fi.st.st_mtime and \ | |
270 | sz == fi.st.st_size: | |
271 | if not s: | |
272 | c.execute('UPDATE hash SET seen = 1 WHERE ino = ?', | |
273 | [fi.st.st_ino]) | |
274 | me._update() | |
275 | return h | |
276 | ||
277 | ## Hash the file. Beware raciness: update the file information from the | |
278 | ## open descriptor, but set the size from what we actually read. | |
279 | h = H.new(me.hash) | |
280 | try: | |
281 | with open(fi.name, 'rb') as f: | |
282 | sz = 0 | |
283 | while True: | |
284 | buf = f.read(me.BUFSZ) | |
285 | if len(buf) == 0: | |
286 | break | |
287 | sz += len(buf) | |
288 | h.update(buf) | |
289 | fi.st = OS.fstat(f.fileno()) | |
290 | ##fi.st.st_size = sz | |
291 | hash = h.digest() | |
292 | except (OSError, IOError), err: | |
293 | fi.st = None | |
294 | fi.err = err | |
295 | return None | |
296 | hash = hash.encode('hex') | |
297 | ||
298 | ## Insert a record into the database. | |
299 | if me._db: | |
300 | c.execute(""" | |
301 | INSERT OR REPLACE INTO hash | |
302 | (ino, mtime, ctime, size, hash, seen) | |
303 | VALUES | |
304 | (?, ?, ?, ?, ?, 1); | |
305 | """, [fi.st.st_ino, | |
306 | fi.st.st_mtime, | |
307 | fi.st.st_ctime, | |
308 | fi.st.st_size, | |
309 | hash]) | |
310 | me._update() | |
311 | ||
312 | ## Done. | |
313 | return hash | |
314 | ||
315 | def _update(me): | |
316 | me._pend += 1 | |
317 | if me._pend >= 1024: | |
318 | me.flush() | |
319 | ||
320 | def flush(me): | |
321 | if me._db: | |
322 | me._db.commit() | |
323 | me._pend = 0 | |
324 | ||
325 | def need_db(me): | |
326 | if not me._db: | |
327 | die("no cache database") | |
328 | ||
329 | def reset(me): | |
330 | me.need_db() | |
331 | c = me._db.cursor() | |
332 | c.execute('UPDATE hash SET seen = 0 WHERE seen') | |
333 | me.flush() | |
334 | ||
335 | def prune(me): | |
336 | me.need_db() | |
337 | c = me._db.cursor() | |
338 | c.execute('DELETE FROM hash WHERE NOT seen') | |
339 | me.flush() | |
340 | ||
341 | ###-------------------------------------------------------------------------- | |
342 | ### Printing output. | |
343 | ||
344 | class GenericFormatter (object): | |
345 | def __init__(me, fi): | |
346 | me.fi = fi | |
347 | def _fmt_time(me, t): | |
348 | tm = T.gmtime(t) | |
349 | return T.strftime('%Y-%m-%dT%H:%M:%SZ', tm) | |
350 | def _enc_name(me, n): | |
8aeb0c53 | 351 | return ' \\-> '.join(n.encode('string_escape').split(' -> ')) |
f6b4ffdc MW |
352 | def name(me): |
353 | return me._enc_name(me.fi.name) | |
354 | def info(me): | |
355 | return me.TYPE | |
356 | def mode(me): | |
357 | return '%06o' % me.fi.st.st_mode | |
358 | def size(me): | |
359 | return me.fi.st.st_size | |
360 | def mtime(me): | |
361 | return me._fmt_time(me.fi.st.st_mtime) | |
362 | def owner(me): | |
363 | return '%5d:%d' % (me.fi.st.st_uid, me.fi.st.st_gid) | |
364 | ||
365 | class ErrorFormatter (GenericFormatter): | |
366 | def info(me): | |
367 | return 'E%d %s' % (me.fi.err.errno, me.fi.err.strerror) | |
368 | def error(me): return 'error' | |
369 | mode = size = mtime = owner = error | |
370 | ||
371 | class SocketFormatter (GenericFormatter): | |
372 | TYPE = 'socket' | |
373 | class PipeFormatter (GenericFormatter): | |
374 | TYPE = 'fifo' | |
375 | ||
376 | class LinkFormatter (GenericFormatter): | |
377 | TYPE = 'symbolic-link' | |
378 | def name(me): | |
379 | n = GenericFormatter.name(me) | |
380 | try: | |
381 | d = OS.readlink(me.fi.name) | |
382 | return '%s -> %s' % (n, me._enc_name(d)) | |
383 | except OSError, err: | |
384 | return '%s -> <E%d %s>' % (n, err.errno, err.strerror) | |
385 | ||
386 | class DirectoryFormatter (GenericFormatter): | |
387 | TYPE = 'directory' | |
388 | def name(me): return GenericFormatter.name(me) + '/' | |
389 | def size(me): return 'dir' | |
390 | ||
391 | class DeviceFormatter (GenericFormatter): | |
392 | def info(me): | |
393 | return '%s %d:%d' % (me.TYPE, | |
394 | OS.major(me.fi.st.st_rdev), | |
395 | OS.minor(me.fi.st.st_rdev)) | |
396 | class BlockDeviceFormatter (DeviceFormatter): | |
397 | TYPE = 'block-device' | |
398 | class CharDeviceFormatter (DeviceFormatter): | |
399 | TYPE = 'character-device' | |
400 | ||
401 | class FileFormatter (GenericFormatter): | |
402 | TYPE = 'regular-file' | |
403 | ||
404 | class Reporter (object): | |
405 | ||
406 | TYMAP = { | |
407 | ST.S_IFSOCK: SocketFormatter, | |
408 | ST.S_IFDIR: DirectoryFormatter, | |
409 | ST.S_IFLNK: LinkFormatter, | |
410 | ST.S_IFREG: FileFormatter, | |
411 | ST.S_IFBLK: BlockDeviceFormatter, | |
412 | ST.S_IFCHR: CharDeviceFormatter, | |
413 | ST.S_IFIFO: PipeFormatter, | |
414 | } | |
415 | ||
416 | def __init__(me, db): | |
417 | me._inomap = {} | |
418 | me._vinomap = {} | |
419 | me._db = db | |
420 | me._hsz = int(H.new(db.hash).digest_size) | |
421 | ||
422 | def file(me, fi): | |
423 | h = me._db.hashfile(fi) | |
424 | if fi.err: | |
425 | fmt = ErrorFormatter(fi) | |
426 | vino = 'error' | |
427 | else: | |
428 | fmt = me.TYMAP[ST.S_IFMT(fi.st.st_mode)](fi) | |
429 | inoidx = fi.st.st_dev, fi.st.st_ino | |
430 | try: | |
431 | vino = me._inomap[inoidx] | |
432 | except KeyError: | |
433 | suffix = '' | |
434 | seq = 0 | |
435 | while True: | |
436 | vino = '%08x' % (Z.crc32(fi.name + suffix) & 0xffffffff) | |
437 | if vino not in me._vinomap: break | |
438 | suffix = '\0%d' % seq | |
439 | seq += 1 | |
440 | me._inomap[inoidx] = vino | |
441 | if h: info = h | |
442 | else: info = '[%-*s]' % (2*me._hsz - 2, fmt.info()) | |
443 | print '%s %8s %6s %-12s %-20s %20s %s' % ( | |
444 | info, vino, fmt.mode(), fmt.owner(), | |
445 | fmt.mtime(), fmt.size(), fmt.name()) | |
446 | ||
447 | ###-------------------------------------------------------------------------- | |
448 | ### Main program. | |
449 | ||
450 | FMTMAP = { | |
451 | 'rsync': lambda f: enum_rsync(stdin, f), | |
452 | 'find0': lambda f: enum_find0(stdin, f) | |
453 | } | |
454 | op = OP.OptionParser( | |
455 | usage = '%prog [-a] [-c CACHE] [-f FORMAT] [-H HASH] [FILE ...]', | |
456 | version = '%%prog, version %s' % VERSION, | |
457 | description = '''\ | |
458 | Print a digest of a filesystem (or a collection of specified files) to | |
459 | standard output. The idea is that the digest should be mostly /complete/ | |
460 | (i.e., any `interesting\' change to the filesystem results in a different | |
461 | digest) and /canonical/ (i.e., identical filesystem contents result in | |
462 | identical output). | |
463 | ''') | |
464 | ||
465 | for short, long, props in [ | |
466 | ('-a', '--all', { 'action': 'store_true', 'dest': 'all', | |
467 | 'help': 'clear cache of all files not seen' }), | |
468 | ('-c', '--cache', { 'dest': 'cache', 'metavar': 'FILE', | |
469 | 'help': 'use FILE as a cache for file hashes' }), | |
470 | ('-f', '--files', { 'dest': 'files', 'metavar': 'FORMAT', | |
471 | 'type': 'choice', 'choices': FMTMAP.keys(), | |
472 | 'help': 'read files to report in the given FORMAT' }), | |
473 | ('-H', '--hash', { 'dest': 'hash', 'metavar': 'HASH', | |
474 | ##'type': 'choice', 'choices': H.algorithms, | |
475 | 'help': 'use HASH as the hash function' })]: | |
476 | op.add_option(short, long, **props) | |
477 | opts, args = op.parse_args(argv) | |
478 | ||
479 | if not opts.files and len(args) <= 1: | |
480 | die("no filename sources: nothing to do") | |
481 | db = HashCache(opts.cache, opts.hash) | |
482 | if opts.all: | |
483 | db.reset() | |
484 | rep = Reporter(db) | |
485 | if opts.files: | |
486 | FMTMAP[opts.files](rep.file) | |
487 | for dir in args[1:]: | |
488 | enum_walk(dir, rep.file) | |
489 | if opts.all: | |
490 | db.prune() | |
491 | db.flush() | |
492 | ||
493 | ###----- That's all, folks -------------------------------------------------- |