--- /dev/null
+#! /usr/bin/python
+###
+### Efficiently construct canonical digests of filesystems
+###
+### (c) 2012 Mark Wooding
+###
+
+###----- Licensing notice ---------------------------------------------------
+###
+### This file is part of the `rsync-backup' program.
+###
+### rsync-backup is free software; you can redistribute it and/or modify
+### it under the terms of the GNU General Public License as published by
+### the Free Software Foundation; either version 2 of the License, or
+### (at your option) any later version.
+###
+### rsync-backup is distributed in the hope that it will be useful,
+### but WITHOUT ANY WARRANTY; without even the implied warranty of
+### MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+### GNU General Public License for more details.
+###
+### You should have received a copy of the GNU General Public License
+### along with rsync-backup; if not, write to the Free Software Foundation,
+### Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+
+from sys import argv, exit, stdin, stdout, stderr
+import os as OS
+import re as RX
+import time as T
+import stat as ST
+import optparse as OP
+import hashlib as H
+import sqlite3 as DB
+import zlib as Z
+
+## Package name and version string, reported by the `--version' option below.
+PACKAGE = 'rsync-backup'
+VERSION = '0.99.1-8-ga844'
+
+###--------------------------------------------------------------------------
+### Utilities.
+
+## Program name for error messages: the basename of how we were invoked.
+QUIS = OS.path.basename(argv[0])
+
+def moan(msg):
+  """Report MSG to standard error, prefixed by the program name."""
+  stderr.write('%s: %s\n' % (QUIS, msg))
+
+def die(msg, rc = 1):
+  """Report MSG as a fatal error, and exit with status RC (default 1)."""
+  moan(msg)
+  exit(rc)
+
+## Count of non-fatal errors reported so far, so that problems can be
+## reported without aborting the whole run.
+SYSERR = 0
+def syserr(msg):
+  """Report MSG as a non-fatal error and bump the global error count."""
+  global SYSERR
+  moan(msg)
+  SYSERR += 1
+
+###--------------------------------------------------------------------------
+### File system enumeration.
+
+class FileInfo (object):
+  """Pair a file name with its `lstat' result.
+
+  Attributes: `name' is the file name as given; `st' is the stat buffer, or
+  None if the stat failed; `err' is the OSError from a failed stat, or None
+  on success.  Callers are expected to inspect `err' rather than catch
+  exceptions.
+  """
+  def __init__(me, file, st = None):
+    me.name = file
+    if st:
+      ## The caller already has a stat buffer for this file: trust it.
+      me.st = st
+      me.err = None
+    else:
+      try:
+        ## `lstat', so that symbolic links are reported as themselves.
+        me.st = OS.lstat(file)
+        me.err = None
+      except OSError, err:
+        ## Remember the failure instead of propagating it.
+        me.st = None
+        me.err = err
+
+def enum_walk(file, func):
+  """Walk the tree rooted at FILE, calling FUNC(FileInfo) for each entry.
+
+  Within a directory, non-directories are visited first (sorted by name),
+  then directories (sorted as if their names ended with `/', matching the
+  ordering of full path names).  Recursion does not cross device boundaries.
+  If FILE ends with `/', we chdir into it and report names relative to it,
+  restoring the original working directory afterwards.
+  """
+
+  def dirents(name):
+    ## Return NAME's directory entries, or [] (reporting a non-fatal error)
+    ## if the directory can't be read.
+    try:
+      return OS.listdir(name)
+    except OSError, err:
+      syserr("failed to read directory `%s': %s" % (name, err.strerror))
+      return []
+
+  def dir(ee, dev):
+    ## Process the directory entries EE; DEV is the device number of the
+    ## walk's root, used to avoid crossing mount points when recursing.
+    ff = []
+    dd = []
+    for e in ee:
+      fi = FileInfo(e)
+      ## NOTE(review): this first `if' has no effect (`pass', followed by an
+      ## independent `if'); it looks as though `elif' was intended on the
+      ## next line, so entries on other devices are still classified and
+      ## reported below -- confirm whether that's deliberate.
+      if fi.st and fi.st.st_dev != dev: pass
+      if fi.st and ST.S_ISDIR(fi.st.st_mode): dd.append(fi)
+      else: ff.append(fi)
+    ## Canonical ordering: plain entries by name; directories by name with a
+    ## trailing `/' appended for sorting purposes.
+    ff.sort(key = lambda fi: fi.name)
+    dd.sort(key = lambda fi: fi.name + '/')
+    for f in ff:
+      func(f)
+    for d in dd:
+      ## Only report and recurse into directories on the same device.
+      if d.st.st_dev == dev:
+        func(d)
+        dir([OS.path.join(d.name, e) for e in dirents(d.name)], dev)
+
+  if file.endswith('/'):
+    ## Trailing `/': walk relative to FILE.  Remember where we came from so
+    ## we can get back.
+    cwd = OS.open('.', OS.O_RDONLY)
+    try:
+      OS.chdir(file)
+      fi = FileInfo('.')
+      func(fi)
+      dir(dirents('.'), fi.st.st_dev)
+    finally:
+      OS.fchdir(cwd)
+      OS.close(cwd)
+  else:
+    ## Plain name: report it, and walk it if it's a directory.
+    fi = FileInfo(file)
+    func(fi)
+    if fi.st and ST.S_ISDIR(fi.st.st_mode):
+      dir([OS.path.join(fi.name, e) for e in dirents(fi.name)],
+          fi.st.st_dev)
+
+def enum_find0(f, func):
+  """Read NUL-terminated file names from the stream F, calling
+  FUNC(FileInfo) for each one -- the format produced by `find ... -print0'.
+  """
+  tail = ""
+  while True:
+    buf = f.read(8192)
+    last = len(buf) == 0
+    ## Names may straddle read boundaries: keep the final unterminated
+    ## fragment around for the next iteration.
+    names = (tail + buf).split('\0')
+    tail = names.pop()
+    for n in names:
+      func(FileInfo(n))
+    if last:
+      break
+  ## Anything left over wasn't NUL-terminated; complain but carry on.
+  if len(tail):
+    moan("ignored trailing junk after last filename")
+
+## Matches rsync's `\#OOO' escape, standing for the byte with octal value OOO.
+RX_RSYNCESC = RX.compile(r'\\ \# ([0-7]{3})', RX.VERBOSE)
+def enum_rsync(f, func):
+  """Parse rsync file-listing output from the stream F, calling
+  FUNC(FileInfo) for each file named in it.
+  """
+
+  ## The format is a little fiddly.  Each line consists of PERMS SIZE DATE
+  ## TIME NAME, separated by runs of whitespace, but the NAME starts exactly
+  ## one space character after the TIME and may begin with a space.
+  ## Sequences of the form `\#OOO' where OOO are three octal digits, stand
+  ## for a byte with that value.  Newlines and backslashes which would be
+  ## ambiguous are converted into this form; all other characters are
+  ## literal.
+  ##
+  ## We ignore the stat information and retrieve it ourselves, because it's
+  ## incomplete.  Hopefully the dcache is still warm.
+
+  for line in f:
+    if line.endswith('\n'): line = line[:-1]
+
+    ## Extract the escaped name.
+    ff = line.split(None, 3)
+    if len(ff) != 4:
+      syserr("ignoring invalid line from rsync: `%s'" % line)
+      continue
+    tail = ff[3]
+    try:
+      ## The name begins one character after the first space in the tail.
+      spc = tail.index(' ')
+    except ValueError:
+      syserr("ignoring invalid line from rsync: `%s'" % line)
+      continue
+    name = tail[spc + 1:]
+
+    ## Now translate escape sequences.
+    name = RX_RSYNCESC.sub(lambda m: chr(int(m.group(1), 8)), name)
+
+    ## Call the client.
+    ## NOTE(review): this handler looks unreachable -- FileInfo.__init__
+    ## catches OSError itself and records it in `err' -- confirm.
+    try:
+      fi = FileInfo(name)
+    except OSError, err:
+      syserr("failed to stat `%s': %s" % (name, err.strerror))
+      continue
+    func(fi)
+
+###--------------------------------------------------------------------------
+### The hash cache.
+
+class HashCache (object):
+  """Cache of file content hashes, keyed by inode number, backed by SQLite.
+
+  For each inode the cache records the mtime, ctime and size observed when
+  the hash was computed, so that apparently-unchanged files need not be
+  re-read.  A `seen' flag supports discarding entries for files which have
+  disappeared: see `reset' and `prune'.  The cache may be absent (FILE is
+  None), in which case every file is hashed from scratch.
+  """
+
+  ## Cache schema version, checked against the `meta' table.
+  VERSION = 0
+  ## Read buffer size used when hashing file contents.
+  BUFSZ = 128*1024
+
+  ## Statements to initialize a fresh cache database.
+  INIT = [
+    """CREATE TABLE meta (
+      version INTEGER NOT NULL,
+      hash TEXT NOT NULL
+    );""",
+    """CREATE TABLE hash (
+      ino INTEGER PRIMARY KEY,
+      mtime INTEGER NOT NULL,
+      ctime INTEGER NOT NULL,
+      size INTEGER NOT NULL,
+      hash TEXT NOT NULL,
+      seen BOOLEAN NOT NULL DEFAULT TRUE
+    );""",
+    """PRAGMA journal_mode = WAL;"""
+  ]
+
+  def __init__(me, file, hash = None):
+    """Open (or create) the cache database FILE, using hash function HASH.
+
+    If FILE is None, operate without a cache; HASH must then be supplied.
+    If the database already names a hash function, HASH (if given) must
+    agree with it; an out-of-date or unreadable database is wiped and
+    re-initialized.
+    """
+
+    if file is None:
+
+      ## We're going this alone, with no cache.
+      db = None
+      if hash is None:
+        die("no hash specified and no database cache to read from")
+    else:
+
+      ## Connect to the database.
+      db = DB.connect(file)
+      db.text_factory = str
+
+      ## See whether we can understand the cache database.
+      c = db.cursor()
+      v = h = None
+      try:
+        c.execute('SELECT version, hash FROM meta')
+        v, h = c.fetchone()
+        if c.fetchone() is not None:
+          die("cache database corrupt: meta table has mutliple rows")
+      except (DB.Error, TypeError):
+        ## Missing or malformed `meta' table: leave v and h as None so the
+        ## re-initialization below kicks in.
+        pass
+
+      ## If that didn't work, we'd better clear the thing and start again.
+      ## But only if we know how to initialize it.
+      if v != me.VERSION:
+
+        ## Explain the situation.
+        moan("cache version %s not understood" % v)
+        if hash is None:
+          if h is None:
+            die("can't initialize cache: no hash function set")
+          else:
+            hash = h
+        ## Make sure hashlib actually knows the chosen function before we
+        ## commit to it.
+        try:
+          H.new(hash)
+        except Exception:
+          die("unknown hash function `%s'" % hash)
+
+        ## Drop old things.
+        ## NOTE(review): this interpolates type and name from sqlite_master
+        ## unquoted into `DROP ... IF EXISTS ...'; fine for our own schema
+        ## names, but confirm behaviour for automatic entries (e.g.
+        ## sqlite_autoindex rows) which SQLite won't allow to be dropped.
+        c.execute('SELECT type, name FROM sqlite_master')
+        for type, name in c.fetchall():
+          c.execute('DROP %s IF EXISTS %s' % (type, name))
+
+        ## Now we're ready to go.
+        for stmt in me.INIT:
+          c.execute(stmt)
+        c.execute('INSERT INTO meta VALUES (?, ?)', [me.VERSION, hash])
+        db.commit()
+
+      ## Check the hash function if necessary.
+      if hash is None:
+        hash = h
+      elif h is not None and h != hash:
+        die("hash mismatch: cache uses %s but %s requested" % (h, hash))
+
+    ## All done.  `_pend' counts updates not yet committed.
+    me.hash = hash
+    me._db = db
+    me._pend = 0
+
+  def hashfile(me, fi):
+    """Return the hex hash of the regular file FI (a FileInfo), or None.
+
+    Uses the cache when a valid entry exists; otherwise reads and hashes the
+    file and records the result.  On read failure, records the error in FI
+    and returns None.
+    """
+
+    ## If this isn't a proper file then don't try to hash it.
+    if fi.err or not ST.S_ISREG(fi.st.st_mode):
+      return None
+
+    ## See whether there's a valid entry in the cache.
+    ## NOTE(review): only mtime and size are compared against the stat
+    ## buffer; the stored ctime is never checked -- confirm whether that's
+    ## intentional.
+    if me._db:
+      c = me._db.cursor()
+      c.execute(
+        'SELECT mtime, size, hash, seen FROM hash WHERE ino = ?;',
+        [fi.st.st_ino])
+      r = c.fetchone()
+      if r is not None:
+        mt, sz, h, s = r
+        if mt == fi.st.st_mtime and \
+           sz == fi.st.st_size:
+          if not s:
+            c.execute('UPDATE hash SET seen = 1 WHERE ino = ?',
+                      [fi.st.st_ino])
+            me._update()
+          return h
+
+    ## Hash the file.  Beware raciness: update the file information from the
+    ## open descriptor, but set the size from what we actually read.
+    h = H.new(me.hash)
+    try:
+      with open(fi.name, 'rb') as f:
+        sz = 0
+        while True:
+          buf = f.read(me.BUFSZ)
+          if len(buf) == 0:
+            break
+          sz += len(buf)
+          h.update(buf)
+        fi.st = OS.fstat(f.fileno())
+        ##fi.st.st_size = sz
+        hash = h.digest()
+    except (OSError, IOError), err:
+      ## Record the failure in FI, consistently with FileInfo's own
+      ## error-reporting convention.
+      fi.st = None
+      fi.err = err
+      return None
+    ## Python 2 idiom: hex-encode the binary digest.
+    hash = hash.encode('hex')
+
+    ## Insert a record into the database.
+    if me._db:
+      c.execute("""
+              INSERT OR REPLACE INTO hash
+                      (ino, mtime, ctime, size, hash, seen)
+              VALUES
+                      (?, ?, ?, ?, ?, 1);
+      """, [fi.st.st_ino,
+            fi.st.st_mtime,
+            fi.st.st_ctime,
+            fi.st.st_size,
+            hash])
+      me._update()
+
+    ## Done.
+    return hash
+
+  def _update(me):
+    ## Note another pending change, committing in batches of 1024 to keep
+    ## transaction overhead down.
+    me._pend += 1
+    if me._pend >= 1024:
+      me.flush()
+
+  def flush(me):
+    """Commit any pending changes to the database."""
+    if me._db:
+      me._db.commit()
+    me._pend = 0
+
+  def need_db(me):
+    """Fail unless a cache database is open."""
+    if not me._db:
+      die("no cache database")
+
+  def reset(me):
+    """Clear the `seen' flag on every cache entry, ready for a fresh run."""
+    me.need_db()
+    c = me._db.cursor()
+    c.execute('UPDATE hash SET seen = 0 WHERE seen')
+    me.flush()
+
+  def prune(me):
+    """Discard entries for files not seen since the last `reset'."""
+    me.need_db()
+    c = me._db.cursor()
+    c.execute('DELETE FROM hash WHERE NOT seen')
+    me.flush()
+
+###--------------------------------------------------------------------------
+### Printing output.
+
+class GenericFormatter (object):
+  """Base formatter: renders a FileInfo as the columns of a report line.
+
+  Subclasses set TYPE to a file-kind label and override methods as needed.
+  """
+  def __init__(me, fi):
+    me.fi = fi
+  def _fmt_time(me, t):
+    ## Render the timestamp T as UTC, in ISO 8601 format.
+    tm = T.gmtime(t)
+    return T.strftime('%Y-%m-%dT%H:%M:%SZ', tm)
+  def _enc_name(me, n):
+    ## Escape the name N; additionally escape a literal ` -> ' so that
+    ## symbolic-link targets can be reported unambiguously.
+    return ' \\-> '.join(n.encode('string_escape').split(' -> '))
+  def name(me):
+    return me._enc_name(me.fi.name)
+  def info(me):
+    ## Extra-information column: by default, just the file-kind label.
+    return me.TYPE
+  def mode(me):
+    return '%06o' % me.fi.st.st_mode
+  def size(me):
+    return me.fi.st.st_size
+  def mtime(me):
+    return me._fmt_time(me.fi.st.st_mtime)
+  def owner(me):
+    return '%5d:%d' % (me.fi.st.st_uid, me.fi.st.st_gid)
+
+class ErrorFormatter (GenericFormatter):
+  """Formatter for files we failed to stat: report the error instead."""
+  def info(me):
+    return 'E%d %s' % (me.fi.err.errno, me.fi.err.strerror)
+  def error(me): return 'error'
+  ## There's no stat buffer, so all stat-derived columns read `error'.
+  mode = size = mtime = owner = error
+
+class SocketFormatter (GenericFormatter):
+  """Formatter for Unix-domain sockets."""
+  TYPE = 'socket'
+class PipeFormatter (GenericFormatter):
+  """Formatter for named pipes (FIFOs)."""
+  TYPE = 'fifo'
+
+class LinkFormatter (GenericFormatter):
+  """Formatter for symbolic links: report `NAME -> TARGET'."""
+  TYPE = 'symbolic-link'
+  def name(me):
+    n = GenericFormatter.name(me)
+    try:
+      d = OS.readlink(me.fi.name)
+      return '%s -> %s' % (n, me._enc_name(d))
+    except OSError, err:
+      ## Couldn't read the target: report the error inline.
+      return '%s -> <E%d %s>' % (n, err.errno, err.strerror)
+
+class DirectoryFormatter (GenericFormatter):
+  """Formatter for directories: names get a trailing `/', size reads `dir'."""
+  TYPE = 'directory'
+  def name(me): return GenericFormatter.name(me) + '/'
+  def size(me): return 'dir'
+
+class DeviceFormatter (GenericFormatter):
+  """Common formatter for device nodes: report major and minor numbers."""
+  def info(me):
+    return '%s %d:%d' % (me.TYPE,
+                         OS.major(me.fi.st.st_rdev),
+                         OS.minor(me.fi.st.st_rdev))
+class BlockDeviceFormatter (DeviceFormatter):
+  """Formatter for block devices."""
+  TYPE = 'block-device'
+class CharDeviceFormatter (DeviceFormatter):
+  """Formatter for character devices."""
+  TYPE = 'character-device'
+
+class FileFormatter (GenericFormatter):
+  """Formatter for regular files."""
+  TYPE = 'regular-file'
+
+class Reporter (object):
+  """Write one report line per file to standard output.
+
+  Hard links are exposed through a stable `virtual inode' tag: files sharing
+  a (device, inode) pair get the same tag, derived from the first such
+  file's name rather than the raw inode number, so output is canonical
+  across filesystems.
+  """
+
+  ## Map from stat file-type bits to the appropriate formatter class.
+  TYMAP = {
+    ST.S_IFSOCK: SocketFormatter,
+    ST.S_IFDIR: DirectoryFormatter,
+    ST.S_IFLNK: LinkFormatter,
+    ST.S_IFREG: FileFormatter,
+    ST.S_IFBLK: BlockDeviceFormatter,
+    ST.S_IFCHR: CharDeviceFormatter,
+    ST.S_IFIFO: PipeFormatter,
+  }
+
+  def __init__(me, db):
+    """Initialize the reporter, hashing files through the HashCache DB."""
+    me._inomap = {}                     # (dev, ino) |-> virtual inode tag
+    me._vinomap = {}                    # tags already issued
+    me._db = db
+    me._hsz = int(H.new(db.hash).digest_size)
+
+  def file(me, fi):
+    """Report the file FI (a FileInfo) on standard output."""
+    h = me._db.hashfile(fi)
+    if fi.err:
+      fmt = ErrorFormatter(fi)
+      vino = 'error'
+    else:
+      fmt = me.TYMAP[ST.S_IFMT(fi.st.st_mode)](fi)
+      inoidx = fi.st.st_dev, fi.st.st_ino
+      try:
+        vino = me._inomap[inoidx]
+      except KeyError:
+        ## First sighting of this inode: derive a tag from the name's CRC,
+        ## appending a sequence number to dodge collisions.
+        ## NOTE(review): fresh tags are tested against `_vinomap' but never
+        ## inserted into it, so the collision check appears ineffective --
+        ## confirm.
+        suffix = ''
+        seq = 0
+        while True:
+          vino = '%08x' % (Z.crc32(fi.name + suffix) & 0xffffffff)
+          if vino not in me._vinomap: break
+          suffix = '\0%d' % seq
+          seq += 1
+        me._inomap[inoidx] = vino
+    ## Unhashable files show the formatter's info string, padded to the
+    ## width of a hex digest.
+    if h: info = h
+    else: info = '[%-*s]' % (2*me._hsz - 2, fmt.info())
+    print '%s %8s %6s %-12s %-20s %20s %s' % (
+      info, vino, fmt.mode(), fmt.owner(),
+      fmt.mtime(), fmt.size(), fmt.name())
+
+###--------------------------------------------------------------------------
+### Main program.
+
+## Map `-f' format names to enumerators reading file names from stdin.
+FMTMAP = {
+  'rsync': lambda f: enum_rsync(stdin, f),
+  'find0': lambda f: enum_find0(stdin, f)
+}
+op = OP.OptionParser(
+  usage = '%prog [-a] [-c CACHE] [-f FORMAT] [-H HASH] [FILE ...]',
+  version = '%%prog, version %s' % VERSION,
+  description = '''\
+Print a digest of a filesystem (or a collection of specified files) to
+standard output.  The idea is that the digest should be mostly /complete/
+(i.e., any `interesting\' change to the filesystem results in a different
+digest) and /canonical/ (i.e., identical filesystem contents result in
+identical output).
+''')
+
+## Declare the command-line options.
+for short, long, props in [
+  ('-a', '--all', { 'action': 'store_true', 'dest': 'all',
+                    'help': 'clear cache of all files not seen' }),
+  ('-c', '--cache', { 'dest': 'cache', 'metavar': 'FILE',
+                      'help': 'use FILE as a cache for file hashes' }),
+  ('-f', '--files', { 'dest': 'files', 'metavar': 'FORMAT',
+                      'type': 'choice', 'choices': FMTMAP.keys(),
+                      'help': 'read files to report in the given FORMAT' }),
+  ('-H', '--hash', { 'dest': 'hash', 'metavar': 'HASH',
+                     ##'type': 'choice', 'choices': H.algorithms,
+                     'help': 'use HASH as the hash function' })]:
+  op.add_option(short, long, **props)
+## Note: the full argv is passed, so args[0] is the program name and the
+## FILE operands are args[1:].
+opts, args = op.parse_args(argv)
+
+## Make sure there's something to do, then report everything requested.
+## With `-a', first clear the `seen' flags and finally prune cache entries
+## for files which no longer turned up.
+if not opts.files and len(args) <= 1:
+  die("no filename sources: nothing to do")
+db = HashCache(opts.cache, opts.hash)
+if opts.all:
+  db.reset()
+rep = Reporter(db)
+if opts.files:
+  FMTMAP[opts.files](rep.file)
+for dir in args[1:]:
+  enum_walk(dir, rep.file)
+if opts.all:
+  db.prune()
+db.flush()
+
+###----- That's all, folks --------------------------------------------------