X-Git-Url: https://git.distorted.org.uk/~mdw/distorted-ansible/blobdiff_plain/3f1ea36d39993a86f47ec5658455305d38f08f37..5fc6de272c4e1d6b41a8c24b6ff5116548ac12c5:/roles/common/files/backup/fshash

diff --git a/roles/common/files/backup/fshash b/roles/common/files/backup/fshash
new file mode 100755
index 0000000..888ef73
--- /dev/null
+++ b/roles/common/files/backup/fshash
@@ -0,0 +1,498 @@
+#! /usr/bin/python
+###
+### Efficiently construct canonical digests of filesystems
+###
+### (c) 2012 Mark Wooding
+###
+
+###----- Licensing notice ---------------------------------------------------
+###
+### This file is part of the `rsync-backup' program.
+###
+### rsync-backup is free software; you can redistribute it and/or modify
+### it under the terms of the GNU General Public License as published by
+### the Free Software Foundation; either version 2 of the License, or
+### (at your option) any later version.
+###
+### rsync-backup is distributed in the hope that it will be useful,
+### but WITHOUT ANY WARRANTY; without even the implied warranty of
+### MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+### GNU General Public License for more details.
+###
+### You should have received a copy of the GNU General Public License
+### along with rsync-backup; if not, write to the Free Software Foundation,
+### Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+
+from sys import argv, exit, stdin, stdout, stderr
+import os as OS
+import re as RX
+import time as T
+import stat as ST
+import optparse as OP
+import hashlib as H
+import sqlite3 as DB
+import zlib as Z
+
+PACKAGE = 'rsync-backup'
+VERSION = '0.99.1-8-ga844'
+
+###--------------------------------------------------------------------------
+### Utilities.
+
+QUIS = OS.path.basename(argv[0])
+
+def moan(msg):
+  stderr.write('%s: %s\n' % (QUIS, msg))
+
+def die(msg, rc = 1):
+  moan(msg)
+  exit(rc)
+
+SYSERR = 0
+def syserr(msg):
+  global SYSERR
+  moan(msg)
+  SYSERR += 1
+
+###--------------------------------------------------------------------------
+### File system enumeration.
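+###
+### Each `enum_...' function below walks some collection of files and calls
+### a caller-supplied function once per file, passing it a `FileInfo'
+### object.  A minimal sketch of the protocol (illustrative only;
+### `print_name' is not part of this program):
+###
+###         def print_name(fi):
+###           if fi.err is None: print fi.name
+###         enum_walk('/etc/', print_name)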
+
+class FileInfo (object):
+  def __init__(me, file, st = None):
+    me.name = file
+    if st:
+      me.st = st
+      me.err = None
+    else:
+      try:
+        me.st = OS.lstat(file)
+        me.err = None
+      except OSError, err:
+        me.st = None
+        me.err = err
+
+def enum_walk(file, func):
+
+  def dirents(name):
+    try:
+      return OS.listdir(name)
+    except OSError, err:
+      syserr("failed to read directory `%s': %s" % (name, err.strerror))
+      return []
+
+  def dir(ee, dev):
+    ff = []
+    dd = []
+    for e in ee:
+      fi = FileInfo(e)
+      if fi.st and fi.st.st_dev != dev: pass
+      elif fi.st and ST.S_ISDIR(fi.st.st_mode): dd.append(fi)
+      else: ff.append(fi)
+    ff.sort(key = lambda fi: fi.name)
+    dd.sort(key = lambda fi: fi.name + '/')
+    for f in ff:
+      func(f)
+    for d in dd:
+      if d.st.st_dev == dev:
+        func(d)
+        dir([OS.path.join(d.name, e) for e in dirents(d.name)], dev)
+
+  if file.endswith('/'):
+    cwd = OS.open('.', OS.O_RDONLY)
+    try:
+      OS.chdir(file)
+      fi = FileInfo('.')
+      func(fi)
+      dir(dirents('.'), fi.st.st_dev)
+    finally:
+      OS.fchdir(cwd)
+      OS.close(cwd)
+  else:
+    fi = FileInfo(file)
+    func(fi)
+    if fi.st and ST.S_ISDIR(fi.st.st_mode):
+      dir([OS.path.join(fi.name, e) for e in dirents(fi.name)],
+          fi.st.st_dev)
+
+def enum_find0(f, func):
+  tail = ""
+  while True:
+    buf = f.read(8192)
+    last = len(buf) == 0
+    names = (tail + buf).split('\0')
+    tail = names.pop()
+    for n in names:
+      func(FileInfo(n))
+    if last:
+      break
+  if len(tail):
+    moan("ignored trailing junk after last filename")
+
+RX_RSYNCESC = RX.compile(r'\\ \# ([0-7]{3})', RX.VERBOSE)
+def enum_rsync(f, func):
+
+  ## The format is a little fiddly.  Each line consists of PERMS SIZE DATE
+  ## TIME NAME, separated by runs of whitespace, but the NAME starts exactly
+  ## one space character after the TIME and may begin with a space.
+  ## Sequences of the form `\#OOO', where OOO are three octal digits, stand
+  ## for a byte with that value.  Newlines and backslashes which would be
+  ## ambiguous are converted into this form; all other characters are
+  ## literal.
+  ##
+  ## We ignore the stat information and retrieve it ourselves, because it's
+  ## incomplete.  Hopefully the dcache is still warm.
+
+  for line in f:
+    if line.endswith('\n'): line = line[:-1]
+
+    ## Extract the escaped name.
+    ff = line.split(None, 3)
+    if len(ff) != 4:
+      syserr("ignoring invalid line from rsync: `%s'" % line)
+      continue
+    tail = ff[3]
+    try:
+      spc = tail.index(' ')
+    except ValueError:
+      syserr("ignoring invalid line from rsync: `%s'" % line)
+      continue
+    name = tail[spc + 1:]
+
+    ## Now translate escape sequences.
+    name = RX_RSYNCESC.sub(lambda m: chr(int(m.group(1), 8)), name)
+
+    ## Call the client.
+    try:
+      fi = FileInfo(name)
+    except OSError, err:
+      syserr("failed to stat `%s': %s" % (name, err.strerror))
+      continue
+    func(fi)
+
+###--------------------------------------------------------------------------
+### The hash cache.
+
+class HashCache (object):
+
+  VERSION = 0
+  BUFSZ = 128*1024
+
+  INIT = [
+    """CREATE TABLE meta (
+        version INTEGER NOT NULL,
+        hash TEXT NOT NULL
+    );""",
+    """CREATE TABLE hash (
+        ino INTEGER PRIMARY KEY,
+        mtime INTEGER NOT NULL,
+        ctime INTEGER NOT NULL,
+        size INTEGER NOT NULL,
+        hash TEXT NOT NULL,
+        seen BOOLEAN NOT NULL DEFAULT TRUE
+    );""",
+    """PRAGMA journal_mode = WAL;"""
+  ]
+
+  def __init__(me, file, hash = None):
+
+    if file is None:
+
+      ## We're going it alone, with no cache.
+      db = None
+      if hash is None:
+        die("no hash specified and no database cache to read from")
+    else:
+
+      ## Connect to the database.
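+      ## (An aside, for illustration only: the cache is an ordinary SQLite
+      ## database, so it can be inspected from stock Python; the file name
+      ## below is hypothetical.
+      ##
+      ##         import sqlite3
+      ##         db = sqlite3.connect('fshash.cache')
+      ##         print db.execute('SELECT version, hash FROM meta').fetchone()
+      ##
+      ## The `meta' table records the schema version and hash function; the
+      ## `hash' table caches one digest per inode.)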
+      db = DB.connect(file)
+      db.text_factory = str
+
+      ## See whether we can understand the cache database.
+      c = db.cursor()
+      v = h = None
+      try:
+        c.execute('SELECT version, hash FROM meta')
+        v, h = c.fetchone()
+        if c.fetchone() is not None:
+          die("cache database corrupt: meta table has multiple rows")
+      except (DB.Error, TypeError):
+        pass
+
+      ## If that didn't work, we'd better clear the thing and start again.
+      ## But only if we know how to initialize it.
+      if v != me.VERSION:
+
+        ## Explain the situation.
+        moan("cache version %s not understood" % v)
+        if hash is None:
+          if h is None:
+            die("can't initialize cache: no hash function set")
+          else:
+            hash = h
+        try:
+          H.new(hash)
+        except Exception:
+          die("unknown hash function `%s'" % hash)
+
+        ## Drop old things.
+        c.execute('SELECT type, name FROM sqlite_master')
+        for type, name in c.fetchall():
+          c.execute('DROP %s IF EXISTS %s' % (type, name))
+
+        ## Now we're ready to go.
+        for stmt in me.INIT:
+          c.execute(stmt)
+        c.execute('INSERT INTO meta VALUES (?, ?)', [me.VERSION, hash])
+        db.commit()
+
+      ## Check the hash function if necessary.
+      if hash is None:
+        hash = h
+      elif h is not None and h != hash:
+        die("hash mismatch: cache uses %s but %s requested" % (h, hash))
+
+    ## All done.
+    me.hash = hash
+    me._db = db
+    me._pend = 0
+
+  def hashfile(me, fi):
+
+    ## If this isn't a proper file then don't try to hash it.
+    if fi.err or not ST.S_ISREG(fi.st.st_mode):
+      return None
+
+    ## See whether there's a valid entry in the cache.
+    if me._db:
+      c = me._db.cursor()
+      c.execute(
+        'SELECT mtime, size, hash, seen FROM hash WHERE ino = ?;',
+        [fi.st.st_ino])
+      r = c.fetchone()
+      if r is not None:
+        mt, sz, h, s = r
+        if mt == fi.st.st_mtime and \
+           sz == fi.st.st_size:
+          if not s:
+            c.execute('UPDATE hash SET seen = 1 WHERE ino = ?',
+                      [fi.st.st_ino])
+            me._update()
+          return h
+
+    ## Hash the file.  Beware raciness: update the file information from the
+    ## open descriptor, but set the size from what we actually read.
+    h = H.new(me.hash)
+    try:
+      with open(fi.name, 'rb') as f:
+        sz = 0
+        while True:
+          buf = f.read(me.BUFSZ)
+          if len(buf) == 0:
+            break
+          sz += len(buf)
+          h.update(buf)
+        fi.st = OS.fstat(f.fileno())
+        ##fi.st.st_size = sz
+        hash = h.digest()
+    except (OSError, IOError), err:
+      fi.st = None
+      fi.err = err
+      return None
+    hash = hash.encode('hex')
+
+    ## Insert a record into the database.
+    if me._db:
+      c.execute("""
+              INSERT OR REPLACE INTO hash
+                      (ino, mtime, ctime, size, hash, seen)
+              VALUES
+                      (?, ?, ?, ?, ?, 1);
+      """, [fi.st.st_ino,
+            fi.st.st_mtime,
+            fi.st.st_ctime,
+            fi.st.st_size,
+            hash])
+      me._update()
+
+    ## Done.
+    return hash
+
+  def _update(me):
+    me._pend += 1
+    if me._pend >= 1024:
+      me.flush()
+
+  def flush(me):
+    if me._db:
+      me._db.commit()
+    me._pend = 0
+
+  def need_db(me):
+    if not me._db:
+      die("no cache database")
+
+  def reset(me):
+    me.need_db()
+    c = me._db.cursor()
+    c.execute('UPDATE hash SET seen = 0 WHERE seen')
+    me.flush()
+
+  def prune(me):
+    me.need_db()
+    c = me._db.cursor()
+    c.execute('DELETE FROM hash WHERE NOT seen')
+    me.flush()
+
+###--------------------------------------------------------------------------
+### Printing output.
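+###
+### One line is printed per file: digest (or a bracketed type/error note for
+### things which can't be hashed), virtual inode number, mode, owner, mtime,
+### size and name.  An illustrative line for an empty file under an MD5
+### cache (values made up; field spacing compressed here):
+###
+###   d41d8cd98f00b204e9800998ecf8427e 632c4c8a 100644 1000:1000 2012-01-01T00:00:00Z 0 ./example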
+
+class GenericFormatter (object):
+  def __init__(me, fi):
+    me.fi = fi
+  def _fmt_time(me, t):
+    tm = T.gmtime(t)
+    return T.strftime('%Y-%m-%dT%H:%M:%SZ', tm)
+  def _enc_name(me, n):
+    return ' \\-> '.join(n.encode('string_escape').split(' -> '))
+  def name(me):
+    return me._enc_name(me.fi.name)
+  def info(me):
+    return me.TYPE
+  def mode(me):
+    return '%06o' % me.fi.st.st_mode
+  def size(me):
+    return me.fi.st.st_size
+  def mtime(me):
+    return me._fmt_time(me.fi.st.st_mtime)
+  def owner(me):
+    return '%5d:%d' % (me.fi.st.st_uid, me.fi.st.st_gid)
+
+class ErrorFormatter (GenericFormatter):
+  def info(me):
+    return 'E%d %s' % (me.fi.err.errno, me.fi.err.strerror)
+  def error(me): return 'error'
+  mode = size = mtime = owner = error
+
+class SocketFormatter (GenericFormatter):
+  TYPE = 'socket'
+class PipeFormatter (GenericFormatter):
+  TYPE = 'fifo'
+
+class LinkFormatter (GenericFormatter):
+  TYPE = 'symbolic-link'
+  def name(me):
+    n = GenericFormatter.name(me)
+    try:
+      d = OS.readlink(me.fi.name)
+      return '%s -> %s' % (n, me._enc_name(d))
+    except OSError, err:
+      return '%s -> <E%d %s>' % (n, err.errno, err.strerror)
+
+class DirectoryFormatter (GenericFormatter):
+  TYPE = 'directory'
+  def name(me): return GenericFormatter.name(me) + '/'
+  def size(me): return 'dir'
+
+class DeviceFormatter (GenericFormatter):
+  def info(me):
+    return '%s %d:%d' % (me.TYPE,
+                         OS.major(me.fi.st.st_rdev),
+                         OS.minor(me.fi.st.st_rdev))
+class BlockDeviceFormatter (DeviceFormatter):
+  TYPE = 'block-device'
+class CharDeviceFormatter (DeviceFormatter):
+  TYPE = 'character-device'
+
+class FileFormatter (GenericFormatter):
+  TYPE = 'regular-file'
+
+class Reporter (object):
+
+  TYMAP = {
+    ST.S_IFSOCK: SocketFormatter,
+    ST.S_IFDIR: DirectoryFormatter,
+    ST.S_IFLNK: LinkFormatter,
+    ST.S_IFREG: FileFormatter,
+    ST.S_IFBLK: BlockDeviceFormatter,
+    ST.S_IFCHR: CharDeviceFormatter,
+    ST.S_IFIFO: PipeFormatter,
+  }
+
+  def __init__(me, db):
+    me._inomap = {}
+    me._vinomap = {}
+    me._db = db
+    me._hsz = int(H.new(db.hash).digest_size)
+
+  def file(me, fi):
+    h = me._db.hashfile(fi)
+    if fi.err:
+      fmt = ErrorFormatter(fi)
+      vino = 'error'
+    else:
+      fmt = me.TYMAP[ST.S_IFMT(fi.st.st_mode)](fi)
+      inoidx = fi.st.st_dev, fi.st.st_ino
+      try:
+        vino = me._inomap[inoidx]
+      except KeyError:
+        suffix = ''
+        seq = 0
+        while True:
+          vino = '%08x' % (Z.crc32(fi.name + suffix) & 0xffffffff)
+          if vino not in me._vinomap: break
+          suffix = '\0%d' % seq
+          seq += 1
+        me._inomap[inoidx] = vino
+        me._vinomap[vino] = inoidx
+    if h: info = h
+    else: info = '[%-*s]' % (2*me._hsz - 2, fmt.info())
+    print '%s %8s %6s %-12s %-20s %20s %s' % (
+      info, vino, fmt.mode(), fmt.owner(),
+      fmt.mtime(), fmt.size(), fmt.name())
+
+###--------------------------------------------------------------------------
+### Main program.
+
+FMTMAP = {
+  'rsync': lambda f: enum_rsync(stdin, f),
+  'find0': lambda f: enum_find0(stdin, f)
+}
+op = OP.OptionParser(
+  usage = '%prog [-a] [-c CACHE] [-f FORMAT] [-H HASH] [FILE ...]',
+  version = '%%prog, version %s' % VERSION,
+  description = '''\
+Print a digest of a filesystem (or a collection of specified files) to
+standard output.  The idea is that the digest should be mostly /complete/
+(i.e., any `interesting\' change to the filesystem results in a different
+digest) and /canonical/ (i.e., identical filesystem contents result in
+identical output).
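+
+For example, `fshash -c cache.db -H sha256 /etc/' digests the files under
+/etc/, caching their hashes in `cache.db' so that a later run need only
+rehash files which have changed.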
+''')
+
+for short, long, props in [
+  ('-a', '--all', { 'action': 'store_true', 'dest': 'all',
+                    'help': 'clear cache of all files not seen' }),
+  ('-c', '--cache', { 'dest': 'cache', 'metavar': 'FILE',
+                      'help': 'use FILE as a cache for file hashes' }),
+  ('-f', '--files', { 'dest': 'files', 'metavar': 'FORMAT',
+                      'type': 'choice', 'choices': FMTMAP.keys(),
+                      'help': 'read files to report in the given FORMAT' }),
+  ('-H', '--hash', { 'dest': 'hash', 'metavar': 'HASH',
+                     ##'type': 'choice', 'choices': H.algorithms,
+                     'help': 'use HASH as the hash function' })]:
+  op.add_option(short, long, **props)
+opts, args = op.parse_args(argv)
+
+if not opts.files and len(args) <= 1:
+  die("no filename sources: nothing to do")
+db = HashCache(opts.cache, opts.hash)
+if opts.all:
+  db.reset()
+rep = Reporter(db)
+if opts.files:
+  FMTMAP[opts.files](rep.file)
+for dir in args[1:]:
+  enum_walk(dir, rep.file)
+if opts.all:
+  db.prune()
+db.flush()
+
+###----- That's all, folks --------------------------------------------------
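+
+###----- Usage examples -------------------------------------------------------
+###
+### Illustrative invocations only; the cache path and hash name below are
+### examples, not built-in defaults:
+###
+###         fshash -H sha256 -c /var/cache/fshash.db /home/
+###         find /home -print0 |
+###                 fshash -f find0 -H sha256 -c /var/cache/fshash.db
+###
+### Adding `-a' also marks every cache entry before the run and afterwards
+### prunes the entries for files which were not seen.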