Release 1.1.1.
[rsync-backup] / fshash.in
CommitLineData
f6b4ffdc
MW
1#! @PYTHON@
2###
3### Efficiently construct canonical digests of filesystems
4###
5### (c) 2012 Mark Wooding
6###
7
8###----- Licensing notice ---------------------------------------------------
9###
10### This file is part of the `rsync-backup' program.
11###
12### rsync-backup is free software; you can redistribute it and/or modify
13### it under the terms of the GNU General Public License as published by
14### the Free Software Foundation; either version 2 of the License, or
15### (at your option) any later version.
16###
17### rsync-backup is distributed in the hope that it will be useful,
18### but WITHOUT ANY WARRANTY; without even the implied warranty of
19### MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20### GNU General Public License for more details.
21###
22### You should have received a copy of the GNU General Public License
23### along with rsync-backup; if not, write to the Free Software Foundation,
24### Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
25
26from sys import argv, exit, stdin, stdout, stderr
27import os as OS
28import re as RX
29import time as T
80d1feec 30import errno as E
f6b4ffdc
MW
31import stat as ST
32import optparse as OP
33import hashlib as H
34import sqlite3 as DB
35import zlib as Z
36
37PACKAGE = '@PACKAGE@'
38VERSION = '@VERSION@'
39
40###--------------------------------------------------------------------------
41### Utilities.
42
43QUIS = OS.path.basename(argv[0])
44
def moan(msg):
  """Grumble MSG to standard error, prefixed by the program's name."""
  stderr.write('%s: %s\n' % (QUIS, msg))
47
def die(msg, rc = 1):
  """Grumble MSG to standard error and give up, exiting with status RC."""
  moan(msg)
  exit(rc)
51
## Running count of system errors reported; consulted for the exit status.
SYSERR = 0
def syserr(msg):
  """Report a nonfatal system error MSG and remember that one happened."""
  global SYSERR
  moan(msg)
  SYSERR = SYSERR + 1
58###--------------------------------------------------------------------------
59### File system enumeration.
60
class FileInfo (object):
  """
  A file name bundled with its `lstat' result.

  Attributes:
    name   the file name, as supplied
    st     the `os.lstat' result for the file, or None if the stat failed
    err    the OSError from the failed stat, or None on success

  If ST is supplied, it is trusted and no stat is performed.
  """
  def __init__(me, file, st = None):
    me.name = file
    if st:
      me.st = st
      me.err = None
    else:
      try:
        me.st = OS.lstat(file)
        me.err = None
      ## `except OSError as err' (rather than `except OSError, err') is
      ## valid from Python 2.6 and is the only form Python 3 accepts.
      except OSError as err:
        me.st = None
        me.err = err
74
def enum_walk(file, func):
  ## Walk the filesystem under FILE, calling FUNC on a FileInfo for every
  ## object found.  Within each directory, non-directories are reported
  ## first (sorted by name), then subdirectories (sorted), recursing only
  ## into subdirectories on the same device.  If FILE ends in `/', we chdir
  ## into it and walk relative names, restoring the original directory
  ## afterwards; otherwise names are reported relative to FILE itself.

  def dirents(name):
    ## Return the entries of directory NAME, or [] (after complaining)
    ## if it can't be read.
    try:
      return OS.listdir(name)
    except OSError, err:
      syserr("failed to read directory `%s': %s" % (name, err.strerror))
      return []

  def dir(ee, dev):
    ## Report the directory entries EE, which live on device DEV, and
    ## recurse into same-device subdirectories.
    ff = []
    dd = []
    for e in ee:
      fi = FileInfo(e)
      ## NOTE(review): this statement does nothing (`pass'); it looks as
      ## though it was meant to skip entries on other devices, but as
      ## written such entries still fall through to the tests below.
      ## Cross-device directories are only suppressed later, by the
      ## st_dev check before recursion -- confirm intended behaviour.
      if fi.st and fi.st.st_dev != dev: pass
      if fi.st and ST.S_ISDIR(fi.st.st_mode): dd.append(fi)
      else: ff.append(fi)
    ## Sort each group by name; the appended `/' makes the directory
    ## ordering match the names as they will be printed.
    ff.sort(key = lambda fi: fi.name)
    dd.sort(key = lambda fi: fi.name + '/')
    for f in ff:
      func(f)
    for d in dd:
      if d.st.st_dev == dev:
        func(d)
        dir([OS.path.join(d.name, e) for e in dirents(d.name)], dev)

  if file.endswith('/'):
    ## Walk relative to FILE: remember the current directory on a file
    ## descriptor so we can reliably return to it afterwards.
    cwd = OS.open('.', OS.O_RDONLY)
    try:
      OS.chdir(file)
      fi = FileInfo('.')
      func(fi)
      dir(dirents('.'), fi.st.st_dev)
    finally:
      OS.fchdir(cwd)
      OS.close(cwd)
  else:
    ## Report FILE itself, and its contents if it turns out to be a
    ## directory.
    fi = FileInfo(file)
    func(fi)
    if fi.st and ST.S_ISDIR(fi.st.st_mode):
      dir([OS.path.join(fi.name, e) for e in dirents(fi.name)],
          fi.st.st_dev)
117
def enum_find0(f, func):
  """
  Report on the NUL-terminated file names read from the stream F.

  Each complete name is wrapped in a `FileInfo' and handed to FUNC, in the
  order read.  A trailing fragment with no terminating NUL is discarded
  with a warning.
  """
  carry = ""
  while True:
    chunk = f.read(8192)
    done = not chunk
    pieces = (carry + chunk).split('\0')
    carry = pieces.pop()
    for nm in pieces:
      func(FileInfo(nm))
    if done: break
  if carry:
    moan("ignored trailing junk after last filename")
131
## Matches rsync's `\#OOO' escape for a byte with octal value OOO.
RX_RSYNCESC = RX.compile(r'\\ \# ([0-7]{3})', RX.VERBOSE)
def enum_rsync(f, func):
  """
  Report on the files named in rsync logging output read from the stream
  F, calling FUNC on a `FileInfo' for each one.
  """

  ## The format is a little fiddly.  Each line consists of PERMS SIZE DATE
  ## TIME NAME, separated by runs of whitespace, but the NAME starts exactly
  ## one space character after the TIME and may begin with a space.
  ## Sequences of the form `\#OOO' where OOO are three octal digits, stand
  ## for a byte with that value.  Newlines and backslashes which would be
  ## ambiguous are converted into this form; all other characters are
  ## literal.
  ##
  ## We ignore the stat information and retrieve it ourselves, because it's
  ## incomplete.  Hopefully the dcache is still warm.

  for line in f:
    if line.endswith('\n'): line = line[:-1]

    ## Extract the escaped name.
    ff = line.split(None, 3)
    if len(ff) != 4:
      syserr("ignoring invalid line from rsync: `%s'" % line)
      continue
    tail = ff[3]
    try:
      spc = tail.index(' ')
    except ValueError:
      syserr("ignoring invalid line from rsync: `%s'" % line)
      continue
    name = tail[spc + 1:]

    ## Now translate escape sequences.
    name = RX_RSYNCESC.sub(lambda m: chr(int(m.group(1), 8)), name)

    ## Call the client.  (FileInfo traps stat failures itself, so this
    ## handler is belt and braces; the `as' form works on Python 2.6+.)
    try:
      fi = FileInfo(name)
    except OSError as err:
      syserr("failed to stat `%s': %s" % (name, err.strerror))
      continue
    func(fi)
172
173###--------------------------------------------------------------------------
174### The hash cache.
175
class HashCache (object):
  """
  A persistent cache of file-content hashes, keyed by inode number.

  The cache lives in a SQLite database, so that files whose mtime and size
  are unchanged since the last run need not be re-read.  If no database
  file is supplied, caching is disabled and every file is hashed afresh.
  """

  ## Bump VERSION when the schema changes; an old cache is then rebuilt.
  VERSION = 0

  ## Chunk size for reading file contents while hashing.
  BUFSZ = 128*1024

  ## Schema: `meta' records the cache version and hash function; `hash'
  ## holds one row per inode, with stat fields used for validation and a
  ## `seen' flag used by reset/prune to expire stale entries.
  INIT = [
    """CREATE TABLE meta (
           version INTEGER NOT NULL,
           hash TEXT NOT NULL
       );""",
    """CREATE TABLE hash (
           ino INTEGER PRIMARY KEY,
           mtime INTEGER NOT NULL,
           ctime INTEGER NOT NULL,
           size INTEGER NOT NULL,
           hash TEXT NOT NULL,
           seen BOOLEAN NOT NULL DEFAULT TRUE
       );""",
    """PRAGMA journal_mode = WAL;"""
  ]

  def __init__(me, file, hash = None):
    """
    Open (or create) the cache database FILE, using hash function HASH.

    Either argument may be None: with no FILE there is no cache and HASH
    is required; with no HASH, the function recorded in the cache is used.
    A cache of the wrong version is cleared and reinitialized.
    """

    if file is None:

      ## We're going this alone, with no cache.
      db = None
      if hash is None:
        die("no hash specified and no database cache to read from")
    else:

      ## Connect to the database.
      db = DB.connect(file)
      db.text_factory = str

      ## See whether we can understand the cache database.
      c = db.cursor()
      v = h = None
      try:
        c.execute('SELECT version, hash FROM meta')
        v, h = c.fetchone()
        if c.fetchone() is not None:
          die("cache database corrupt: meta table has multiple rows")
      except (DB.Error, TypeError):
        pass

      ## If that didn't work, we'd better clear the thing and start again.
      ## But only if we know how to initialize it.
      if v != me.VERSION:

        ## Explain the situation.
        moan("cache version %s not understood" % v)
        if hash is None:
          if h is None:
            die("can't initialize cache: no hash function set")
          else:
            hash = h
        try:
          H.new(hash)
        except Exception:
          die("unknown hash function `%s'" % hash)

        ## Drop old things.
        c.execute('SELECT type, name FROM sqlite_master')
        for type, name in c.fetchall():
          c.execute('DROP %s IF EXISTS %s' % (type, name))

        ## Now we're ready to go.
        for stmt in me.INIT:
          c.execute(stmt)
        c.execute('INSERT INTO meta VALUES (?, ?)', [me.VERSION, hash])
        db.commit()

      ## Check the hash function if necessary.
      if hash is None:
        hash = h
      elif h is not None and h != hash:
        die("hash mismatch: cache uses %s but %s requested" % (h, hash))

    ## All done.
    me.hash = hash
    me._db = db
    me._pend = 0

  def hashfile(me, fi):
    """
    Return the hex digest of the regular file described by FI.

    Uses the cache when the inode, mtime and size all match; otherwise
    reads and hashes the file and records the result.  Returns None (and
    sets FI.err) on read failure, or for things that aren't regular files.
    """

    ## If this isn't a proper file then don't try to hash it.
    if fi.err or not ST.S_ISREG(fi.st.st_mode):
      return None

    ## See whether there's a valid entry in the cache.
    if me._db:
      c = me._db.cursor()
      c.execute(
        'SELECT mtime, size, hash, seen FROM hash WHERE ino = ?;',
        [fi.st.st_ino])
      r = c.fetchone()
      if r is not None:
        mt, sz, h, s = r
        if mt == fi.st.st_mtime and \
           sz == fi.st.st_size:
          ## Cache hit: make sure the entry is marked as seen.
          if not s:
            c.execute('UPDATE hash SET seen = 1 WHERE ino = ?',
                      [fi.st.st_ino])
            me._update()
          return h

    ## Hash the file.  Beware raciness: update the file information from the
    ## open descriptor, but set the size from what we actually read.
    h = H.new(me.hash)
    try:
      with open(fi.name, 'rb') as f:
        sz = 0
        while True:
          buf = f.read(me.BUFSZ)
          if len(buf) == 0:
            break
          sz += len(buf)
          h.update(buf)
        fi.st = OS.fstat(f.fileno())
        ##fi.st.st_size = sz
        ## hexdigest() gives the same string as digest().encode('hex') did,
        ## but without relying on the Python-2-only `hex' codec.
        hash = h.hexdigest()
    except (OSError, IOError) as err:
      fi.st = None
      fi.err = err
      return None

    ## Insert a record into the database.
    if me._db:
      c.execute("""
              INSERT OR REPLACE INTO hash
                      (ino, mtime, ctime, size, hash, seen)
              VALUES
                      (?, ?, ?, ?, ?, 1);
      """, [fi.st.st_ino,
            fi.st.st_mtime,
            fi.st.st_ctime,
            fi.st.st_size,
            hash])
      me._update()

    ## Done.
    return hash

  def _update(me):
    ## Note a pending change; commit in batches so a crash loses little
    ## work without paying for a transaction per file.
    me._pend += 1
    if me._pend >= 1024:
      me.flush()

  def flush(me):
    """Commit any outstanding changes to the database."""
    if me._db:
      me._db.commit()
    me._pend = 0

  def need_db(me):
    """Fail unless a cache database is actually open."""
    if not me._db:
      die("no cache database")

  def forget(me, ino):
    """Discard the cache entry for inode INO, if any."""
    me.need_db()
    c = me._db.cursor()
    c.execute('DELETE FROM hash WHERE ino = ?', [ino])

  def reset(me):
    """Clear all the `seen' flags, ready for a fresh enumeration."""
    me.need_db()
    c = me._db.cursor()
    c.execute('UPDATE hash SET seen = 0 WHERE seen')
    me.flush()

  def prune(me):
    """Discard entries for files not seen since the last `reset'."""
    me.need_db()
    c = me._db.cursor()
    c.execute('DELETE FROM hash WHERE NOT seen')
    me.flush()
351
352###--------------------------------------------------------------------------
353### Printing output.
354
class GenericFormatter (object):
  """
  Base class for the report formatters: renders the common fields of a
  `FileInfo' as the strings which appear in the output listing.
  """
  def __init__(me, fi):
    me.fi = fi
  def _fmt_time(me, t):
    ## Render timestamp T as ISO8601 UTC.
    return T.strftime('%Y-%m-%dT%H:%M:%SZ', T.gmtime(t))
  def _enc_name(me, n):
    ## Escape awkward bytes in N, and protect any literal ` -> ' so it
    ## can't be mistaken for a symlink-target separator.
    escaped = n.encode('string_escape')
    return escaped.replace(' -> ', ' \\-> ')
  def name(me):
    return me._enc_name(me.fi.name)
  def info(me):
    return me.TYPE
  def mode(me):
    return '%06o' % me.fi.st.st_mode
  def size(me):
    return me.fi.st.st_size
  def mtime(me):
    return me._fmt_time(me.fi.st.st_mtime)
  def owner(me):
    return '%5d:%d' % (me.fi.st.st_uid, me.fi.st.st_gid)
375
class ErrorFormatter (GenericFormatter):
  """Formatter for files whose stat failed: report the error instead."""
  def info(me):
    err = me.fi.err
    return 'E%d %s' % (err.errno, err.strerror)
  def error(me): return 'error'
  ## Without a stat result, none of these fields means anything.
  mode = size = mtime = owner = error
381
class SocketFormatter (GenericFormatter):
  ## Unix-domain sockets need no detail beyond their type.
  TYPE = 'socket'
class PipeFormatter (GenericFormatter):
  ## Likewise named pipes (FIFOs).
  TYPE = 'fifo'
386
class LinkFormatter (GenericFormatter):
  """Formatter for symbolic links: append ` -> TARGET' to the name."""
  TYPE = 'symbolic-link'
  def name(me):
    n = GenericFormatter.name(me)
    try:
      d = OS.readlink(me.fi.name)
      return '%s -> %s' % (n, me._enc_name(d))
    ## `as' form of the except clause: valid on Python 2.6+ and Python 3.
    except OSError as err:
      ## Can't read the target: show the error in its place.
      return '%s -> <E%d %s>' % (n, err.errno, err.strerror)
396
class DirectoryFormatter (GenericFormatter):
  ## Directories get a trailing `/' on the name and no meaningful size.
  TYPE = 'directory'
  def name(me): return GenericFormatter.name(me) + '/'
  def size(me): return 'dir'
401
class DeviceFormatter (GenericFormatter):
  ## Device nodes: report the type together with major:minor numbers.
  def info(me):
    return '%s %d:%d' % (me.TYPE,
                         OS.major(me.fi.st.st_rdev),
                         OS.minor(me.fi.st.st_rdev))
class BlockDeviceFormatter (DeviceFormatter):
  ## Block devices.
  TYPE = 'block-device'
class CharDeviceFormatter (DeviceFormatter):
  ## Character devices.
  TYPE = 'character-device'
411
class FileFormatter (GenericFormatter):
  ## Regular files: the interesting part is the hash, printed separately.
  TYPE = 'regular-file'
414
class Reporter (object):
  """
  Ties the formatters together and emits one report line per file,
  fetching hashes from (and updating) the cache DB as it goes.
  """

  ## Map from `stat' file-type bits to the formatter class to use.
  TYMAP = {
    ST.S_IFSOCK: SocketFormatter,
    ST.S_IFDIR: DirectoryFormatter,
    ST.S_IFLNK: LinkFormatter,
    ST.S_IFREG: FileFormatter,
    ST.S_IFBLK: BlockDeviceFormatter,
    ST.S_IFCHR: CharDeviceFormatter,
    ST.S_IFIFO: PipeFormatter,
  }

  def __init__(me, db):
    """Prepare to report files, using the HashCache DB for hashes."""
    me._inomap = {}
    me._vinomap = {}
    me._db = db
    me._hsz = int(H.new(db.hash).digest_size)

  def file(me, fi):
    """Write the report line for the file described by FI to stdout."""
    h = me._db.hashfile(fi)
    if fi.err:
      fmt = ErrorFormatter(fi)
      vino = 'error'
    else:
      fmt = me.TYMAP[ST.S_IFMT(fi.st.st_mode)](fi)
      inoidx = fi.st.st_dev, fi.st.st_ino
      try:
        vino = me._inomap[inoidx]
      except KeyError:
        ## Invent a stable pseudo-inode from the name, dodging collisions
        ## with pseudo-inodes already handed out.
        suffix = ''
        seq = 0
        while True:
          vino = '%08x' % (Z.crc32(fi.name + suffix) & 0xffffffff)
          if vino not in me._vinomap: break
          suffix = '\0%d' % seq
          seq += 1
        ## BUGFIX: record the pseudo-inode in `_vinomap' too -- previously
        ## it was never populated, so the collision check above could
        ## never fire.
        me._vinomap[vino] = inoidx
        me._inomap[inoidx] = vino
    if h: info = h
    else: info = '[%-*s]' % (2*me._hsz - 2, fmt.info())
    ## Write the line directly rather than via the Python 2 `print'
    ## statement; the output is identical and this form is portable.
    stdout.write('%s %8s %6s %-12s %-20s %20s %s\n' % (
      info, vino, fmt.mode(), fmt.owner(),
      fmt.mtime(), fmt.size(), fmt.name()))
457
458###--------------------------------------------------------------------------
80d1feec
MW
459### Database clearing from diff files.
460
## Matches a unified-diff hunk header, capturing the old and new counts.
R_HUNK = RX.compile(r'^@@ -\d+,(\d+) \+\d+,(\d+) @@$')

def clear_entry(db, lno, line):
  """
  Parse LINE, taken from line number LNO of a digest diff, and forget the
  cache entry for the file it names.  Return False if the line couldn't be
  parsed or the file couldn't be statted (other than not existing).
  """

  good = True

  ## Parse off the type (bracketed) or hash (bare) field.
  if line.startswith('['):
    pos = line.find(']')
    if pos < 0:
      moan("failed to parse file entry (type field; line %d)" % lno)
      return False
    ty = line[1:pos].strip()
    rest = line[pos + 1:]
    hash = None
  else:
    ff = line.split(None, 1)
    if len(ff) != 2:
      moan("failed to parse file entry (field split; line %d)" % lno)
      return False
    ty = 'regular-file'
    hash, rest = ff

  ## Split out the remaining fixed fields; the name is everything left.
  ff = rest.split(None, 5)
  if len(ff) != 6:
    moan("failed to parse file entry (field split; line %d)" % lno)
    return False
  ino, mode, uidgid, mtime, sz, name = ff

  ## Symbolic links carry their target in the name field.
  if ty != 'symbolic-link':
    target = None
  else:
    nn = name.split(' -> ', 1)
    if len(nn) != 2:
      moan("failed to parse file entry (name split; line %d)" % lno)
      return False
    name, target = nn
    target = target.decode('string_escape')
  name = name.decode('string_escape')

  ## Find the file's current inode and forget its cache entry.  A missing
  ## file is fine (there's nothing to forget); other stat errors are not.
  try:
    st = OS.lstat(name)
  except OSError as e:
    moan("failed to stat `%s': %s" % (name, e.strerror))
    if e.errno != E.ENOENT: good = False
  else:
    stdout.write("Clear cache entry for `%s'\n" % name)
    db.forget(st.st_ino)

  return good
510
def clear_cache(db):
  """
  Read a unified diff between two digests from stdin and clear the cache
  entries (in DB) of every file mentioned on a changed line.  Return True
  on success, False if anything went wrong.
  """

  ## Work through the input diff file one line at a time.
  diffstate = 'gap'
  lno = 0
  good = True
  for line in stdin:
    if line.endswith('\n'): line = line[:-1]
    lno += 1

    ## We're in a gap between hunks.  Find a hunk header and extract the line
    ## counts.
    if diffstate == 'gap':
      m = R_HUNK.match(line)
      if m:
        oldlines = int(m.group(1))
        newlines = int(m.group(2))
        diffstate = 'hunk'
        hdrlno = lno

    ## We're in a hunk.  Keep track of whether we've reached the end, and
    ## discard entries from the cache for mismatching lines.
    elif diffstate == 'hunk':
      if len(line) == 0:
        moan("empty line in diff hunk (line %d)" % lno)
        good = False
        ## BUGFIX: skip to the next line; previously control fell through
        ## to `line[0]' below and raised IndexError.
        continue
      ty = line[0]
      if ty == ' ':
        oldlines -= 1; newlines -= 1
      elif ty == '+':
        newlines -= 1
        if not clear_entry(db, lno, line[1:]): good = False
      elif ty == '-':
        oldlines -= 1
        if not clear_entry(db, lno, line[1:]): good = False
      else:
        moan("incomprehensible line in diff hunk (line %d)" % lno)
        ## BUGFIX: was `good = false' (lower case), a NameError.
        good = False
      if oldlines < 0 or newlines < 0:
        moan("inconsistent lengths in diff hunk header (line %d)" % hdrlno)
        good = False
      if oldlines == newlines == 0:
        diffstate = 'gap'

  ## Complain if the input stopped in mid-hunk.
  if diffstate == 'hunk':
    moan("truncated diff hunk (started at line %d)" % hdrlno)
    good = False

  return good
560
561###--------------------------------------------------------------------------
f6b4ffdc
MW
562### Main program.
563
## Map each `-f' format name to a driver which feeds file reports to the
## given callback.
FMTMAP = {
  'rsync': lambda f: enum_rsync(stdin, f),
  'find0': lambda f: enum_find0(stdin, f)
}
op = OP.OptionParser(
  usage = '%prog [-au] [-c CACHE] [-f FORMAT] [-H HASH] [FILE ...]',
  version = '%%prog, version %s' % VERSION,
  description = '''\
Print a digest of a filesystem (or a collection of specified files) to
standard output.  The idea is that the digest should be mostly /complete/
(i.e., any `interesting\' change to the filesystem results in a different
digest) and /canonical/ (i.e., identical filesystem contents result in
identical output).
''')

## Define the command-line options.
for short, long, props in [
  ('-a', '--all', { 'action': 'store_true', 'dest': 'all',
                    'help': 'clear cache of all files not seen' }),
  ('-c', '--cache', { 'dest': 'cache', 'metavar': 'FILE',
                      'help': 'use FILE as a cache for file hashes' }),
  ('-f', '--files', { 'dest': 'files', 'metavar': 'FORMAT',
                      'type': 'choice', 'choices': FMTMAP.keys(),
                      'help': 'read files to report in the given FORMAT' }),
  ('-u', '--udiff', { 'action': 'store_true', 'dest': 'udiff',
                      'help': 'read diff from stdin, clear cache entries' }),
  ('-H', '--hash', { 'dest': 'hash', 'metavar': 'HASH',
                     ##'type': 'choice', 'choices': H.algorithms,
                     'help': 'use HASH as the hash function' })]:
  op.add_option(short, long, **props)
opts, args = op.parse_args(argv)

if opts.udiff:
  ## Diff mode: read a digest diff from stdin and clear the cache entries
  ## of the files it mentions.  Requires a cache; an optional single
  ## argument names the directory the digest was taken relative to.
  if opts.cache is None or opts.all or opts.files or len(args) > 2:
    die("incompatible options: `-u' requires `-c CACHE', forbids others")
  db = HashCache(opts.cache, opts.hash)
  if len(args) == 2: OS.chdir(args[1])
  good = True
  if not clear_cache(db): good = False
  if good: db.flush()
  else: exit(2)
else:
  ## Digest mode: enumerate the files (from stdin in the chosen format,
  ## and/or by walking the named directories) and print the report.
  if not opts.files and len(args) <= 1:
    die("no filename sources: nothing to do")
  db = HashCache(opts.cache, opts.hash)
  ## With `-a', expire cache entries for files we don't see this run.
  if opts.all:
    db.reset()
  rep = Reporter(db)
  if opts.files:
    FMTMAP[opts.files](rep.file)
  for dir in args[1:]:
    enum_walk(dir, rep.file)
  if opts.all:
    db.prune()
  db.flush()
f6b4ffdc
MW
618
619###----- That's all, folks --------------------------------------------------