roles/common/: Do the rest of the common configuration.
roles/common/files/backup/fshash
#! /usr/bin/python
###
### Efficiently construct canonical digests of filesystems
###
### (c) 2012 Mark Wooding
###

###----- Licensing notice ---------------------------------------------------
###
### This file is part of the `rsync-backup' program.
###
### rsync-backup is free software; you can redistribute it and/or modify
### it under the terms of the GNU General Public License as published by
### the Free Software Foundation; either version 2 of the License, or
### (at your option) any later version.
###
### rsync-backup is distributed in the hope that it will be useful,
### but WITHOUT ANY WARRANTY; without even the implied warranty of
### MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
### GNU General Public License for more details.
###
### You should have received a copy of the GNU General Public License
### along with rsync-backup; if not, write to the Free Software Foundation,
### Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.

from sys import argv, exit, stdin, stdout, stderr
import os as OS
import re as RX
import time as T
import stat as ST
import optparse as OP
import hashlib as H
import sqlite3 as DB
import zlib as Z

PACKAGE = 'rsync-backup'
VERSION = '0.99.1-8-ga844'

###--------------------------------------------------------------------------
### Utilities.

QUIS = OS.path.basename(argv[0])

def moan(msg):
  stderr.write('%s: %s\n' % (QUIS, msg))

def die(msg, rc = 1):
  moan(msg)
  exit(rc)

SYSERR = 0
def syserr(msg):
  global SYSERR
  moan(msg)
  SYSERR += 1

###--------------------------------------------------------------------------
### File system enumeration.

class FileInfo (object):
  def __init__(me, file, st = None):
    me.name = file
    if st:
      me.st = st
      me.err = None
    else:
      try:
        me.st = OS.lstat(file)
        me.err = None
      except OSError, err:
        me.st = None
        me.err = err

def enum_walk(file, func):
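  ## Walk the filesystem object FILE and, if it's a directory, its
  ## descendants, in a canonical (sorted) order, calling FUNC on a FileInfo
  ## for each item found.  Directories on a different device from the
  ## starting point are not descended into.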

  def dirents(name):
    try:
      return OS.listdir(name)
    except OSError, err:
      syserr("failed to read directory `%s': %s" % (name, err.strerror))
      return []

  def dir(ee, dev):
    ff = []
    dd = []
    for e in ee:
      fi = FileInfo(e)
      if fi.st and fi.st.st_dev != dev: pass
      if fi.st and ST.S_ISDIR(fi.st.st_mode): dd.append(fi)
      else: ff.append(fi)
    ff.sort(key = lambda fi: fi.name)
    dd.sort(key = lambda fi: fi.name + '/')
    for f in ff:
      func(f)
    for d in dd:
      if d.st.st_dev == dev:
        func(d)
        dir([OS.path.join(d.name, e) for e in dirents(d.name)], dev)

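  ## A FILE name ending in `/' means: step into that directory and report
  ## its contents by names relative to it; otherwise the name is reported
  ## exactly as given.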
  if file.endswith('/'):
    cwd = OS.open('.', OS.O_RDONLY)
    try:
      OS.chdir(file)
      fi = FileInfo('.')
      func(fi)
      dir(dirents('.'), fi.st.st_dev)
    finally:
      OS.fchdir(cwd)
      OS.close(cwd)
  else:
    fi = FileInfo(file)
    func(fi)
    if fi.st and ST.S_ISDIR(fi.st.st_mode):
      dir([OS.path.join(fi.name, e) for e in dirents(fi.name)],
          fi.st.st_dev)

def enum_find0(f, func):
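  ## Read a NUL-terminated list of file names (e.g., as produced by
  ## `find ... -print0') from the stream F, calling FUNC on a FileInfo for
  ## each one.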
  tail = ""
  while True:
    buf = f.read(8192)
    last = len(buf) == 0
    names = (tail + buf).split('\0')
    tail = names.pop()
    for n in names:
      func(FileInfo(n))
    if last:
      break
  if len(tail):
    moan("ignored trailing junk after last filename")

RX_RSYNCESC = RX.compile(r'\\ \# ([0-7]{3})', RX.VERBOSE)
def enum_rsync(f, func):

  ## The format is a little fiddly.  Each line consists of PERMS SIZE DATE
  ## TIME NAME, separated by runs of whitespace, but the NAME starts exactly
  ## one space character after the TIME and may begin with a space.
  ## Sequences of the form `\#OOO', where OOO is three octal digits, stand
  ## for the byte with that value.  Newlines and backslashes which would be
  ## ambiguous are converted into this form; all other characters are
  ## literal.
  ##
  ## We ignore the stat information and retrieve it ourselves, because it's
  ## incomplete.  Hopefully the dcache is still warm.
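  ##
  ## For example (purely illustrative), a listing line of the form
  ##
  ##      -rw-r--r--          1234 2012/03/04 05:06:07 hello\#040world
  ##
  ## names a regular file called `hello world'.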

  for line in f:
    if line.endswith('\n'): line = line[:-1]

    ## Extract the escaped name.
    ff = line.split(None, 3)
    if len(ff) != 4:
      syserr("ignoring invalid line from rsync: `%s'" % line)
      continue
    tail = ff[3]
    try:
      spc = tail.index(' ')
    except ValueError:
      syserr("ignoring invalid line from rsync: `%s'" % line)
      continue
    name = tail[spc + 1:]

    ## Now translate escape sequences.
    name = RX_RSYNCESC.sub(lambda m: chr(int(m.group(1), 8)), name)

    ## Call the client.
    try:
      fi = FileInfo(name)
    except OSError, err:
      syserr("failed to stat `%s': %s" % (name, err.strerror))
      continue
    func(fi)

###--------------------------------------------------------------------------
### The hash cache.

class HashCache (object):
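  ## A cache of file hashes kept in an SQLite database.  Entries are keyed
  ## by inode number and considered valid while the file's mtime and size
  ## are unchanged; the `seen' flag lets `reset' and `prune' discard entries
  ## for files which no longer exist.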

  VERSION = 0
  BUFSZ = 128*1024

  INIT = [
    """CREATE TABLE meta (
        version INTEGER NOT NULL,
        hash TEXT NOT NULL
    );""",
    """CREATE TABLE hash (
        ino INTEGER PRIMARY KEY,
        mtime INTEGER NOT NULL,
        ctime INTEGER NOT NULL,
        size INTEGER NOT NULL,
        hash TEXT NOT NULL,
        seen BOOLEAN NOT NULL DEFAULT TRUE
    );""",
    """PRAGMA journal_mode = WAL;"""
  ]

  def __init__(me, file, hash = None):

    if file is None:

      ## We're going it alone, with no cache.
      db = None
      if hash is None:
        die("no hash specified and no database cache to read from")
    else:

      ## Connect to the database.
      db = DB.connect(file)
      db.text_factory = str

      ## See whether we can understand the cache database.
      c = db.cursor()
      v = h = None
      try:
        c.execute('SELECT version, hash FROM meta')
        v, h = c.fetchone()
        if c.fetchone() is not None:
          die("cache database corrupt: meta table has multiple rows")
      except (DB.Error, TypeError):
        pass

      ## If that didn't work, we'd better clear the thing and start again.
      ## But only if we know how to initialize it.
      if v != me.VERSION:

        ## Explain the situation.
        moan("cache version %s not understood" % v)
        if hash is None:
          if h is None:
            die("can't initialize cache: no hash function set")
          else:
            hash = h
        try:
          H.new(hash)
        except Exception:
          die("unknown hash function `%s'" % hash)

        ## Drop old things.
        c.execute('SELECT type, name FROM sqlite_master')
        for type, name in c.fetchall():
          c.execute('DROP %s IF EXISTS %s' % (type, name))

        ## Now we're ready to go.
        for stmt in me.INIT:
          c.execute(stmt)
        c.execute('INSERT INTO meta VALUES (?, ?)', [me.VERSION, hash])
        db.commit()

      ## Check the hash function if necessary.
      if hash is None:
        hash = h
      elif h is not None and h != hash:
        die("hash mismatch: cache uses %s but %s requested" % (h, hash))

    ## All done.
    me.hash = hash
    me._db = db
    me._pend = 0

  def hashfile(me, fi):

    ## If this isn't a proper file then don't try to hash it.
    if fi.err or not ST.S_ISREG(fi.st.st_mode):
      return None

    ## See whether there's a valid entry in the cache.
    if me._db:
      c = me._db.cursor()
      c.execute(
          'SELECT mtime, size, hash, seen FROM hash WHERE ino = ?;',
          [fi.st.st_ino])
      r = c.fetchone()
      if r is not None:
        mt, sz, h, s = r
        if mt == fi.st.st_mtime and \
           sz == fi.st.st_size:
          if not s:
            c.execute('UPDATE hash SET seen = 1 WHERE ino = ?',
                      [fi.st.st_ino])
            me._update()
          return h

    ## Hash the file.  Beware raciness: update the file information from the
    ## open descriptor, but set the size from what we actually read.
    h = H.new(me.hash)
    try:
      with open(fi.name, 'rb') as f:
        sz = 0
        while True:
          buf = f.read(me.BUFSZ)
          if len(buf) == 0:
            break
          sz += len(buf)
          h.update(buf)
        fi.st = OS.fstat(f.fileno())
        ##fi.st.st_size = sz
        hash = h.digest()
    except (OSError, IOError), err:
      fi.st = None
      fi.err = err
      return None
    hash = hash.encode('hex')

    ## Insert a record into the database.
    if me._db:
      c.execute("""
              INSERT OR REPLACE INTO hash
                      (ino, mtime, ctime, size, hash, seen)
              VALUES
                      (?, ?, ?, ?, ?, 1);
              """, [fi.st.st_ino,
                    fi.st.st_mtime,
                    fi.st.st_ctime,
                    fi.st.st_size,
                    hash])
      me._update()

    ## Done.
    return hash

  def _update(me):
    me._pend += 1
    if me._pend >= 1024:
      me.flush()

  def flush(me):
    if me._db:
      me._db.commit()
      me._pend = 0

  def need_db(me):
    if not me._db:
      die("no cache database")

  def reset(me):
    me.need_db()
    c = me._db.cursor()
    c.execute('UPDATE hash SET seen = 0 WHERE seen')
    me.flush()

  def prune(me):
    me.need_db()
    c = me._db.cursor()
    c.execute('DELETE FROM hash WHERE NOT seen')
    me.flush()

###--------------------------------------------------------------------------
### Printing output.

class GenericFormatter (object):
  def __init__(me, fi):
    me.fi = fi
  def _fmt_time(me, t):
    tm = T.gmtime(t)
    return T.strftime('%Y-%m-%dT%H:%M:%SZ', tm)
  def _enc_name(me, n):
    return ' \\-> '.join(n.encode('string_escape').split(' -> '))
  def name(me):
    return me._enc_name(me.fi.name)
  def info(me):
    return me.TYPE
  def mode(me):
    return '%06o' % me.fi.st.st_mode
  def size(me):
    return me.fi.st.st_size
  def mtime(me):
    return me._fmt_time(me.fi.st.st_mtime)
  def owner(me):
    return '%5d:%d' % (me.fi.st.st_uid, me.fi.st.st_gid)

class ErrorFormatter (GenericFormatter):
  def info(me):
    return 'E%d %s' % (me.fi.err.errno, me.fi.err.strerror)
  def error(me): return 'error'
  mode = size = mtime = owner = error

class SocketFormatter (GenericFormatter):
  TYPE = 'socket'
class PipeFormatter (GenericFormatter):
  TYPE = 'fifo'

class LinkFormatter (GenericFormatter):
  TYPE = 'symbolic-link'
  def name(me):
    n = GenericFormatter.name(me)
    try:
      d = OS.readlink(me.fi.name)
      return '%s -> %s' % (n, me._enc_name(d))
    except OSError, err:
      return '%s -> <E%d %s>' % (n, err.errno, err.strerror)

class DirectoryFormatter (GenericFormatter):
  TYPE = 'directory'
  def name(me): return GenericFormatter.name(me) + '/'
  def size(me): return 'dir'

class DeviceFormatter (GenericFormatter):
  def info(me):
    return '%s %d:%d' % (me.TYPE,
                         OS.major(me.fi.st.st_rdev),
                         OS.minor(me.fi.st.st_rdev))
class BlockDeviceFormatter (DeviceFormatter):
  TYPE = 'block-device'
class CharDeviceFormatter (DeviceFormatter):
  TYPE = 'character-device'

class FileFormatter (GenericFormatter):
  TYPE = 'regular-file'

class Reporter (object):
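  ## Emit one output line per filesystem object.  Each object is labelled
  ## with a `virtual inode': a CRC-32 of its name (plus a disambiguating
  ## suffix on collision), reused for further hard links to the same file,
  ## so that identical trees yield identical digests.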

  TYMAP = {
    ST.S_IFSOCK: SocketFormatter,
    ST.S_IFDIR: DirectoryFormatter,
    ST.S_IFLNK: LinkFormatter,
    ST.S_IFREG: FileFormatter,
    ST.S_IFBLK: BlockDeviceFormatter,
    ST.S_IFCHR: CharDeviceFormatter,
    ST.S_IFIFO: PipeFormatter,
  }

  def __init__(me, db):
    me._inomap = {}
    me._vinomap = {}
    me._db = db
    me._hsz = int(H.new(db.hash).digest_size)

  def file(me, fi):
    h = me._db.hashfile(fi)
    if fi.err:
      fmt = ErrorFormatter(fi)
      vino = 'error'
    else:
      fmt = me.TYMAP[ST.S_IFMT(fi.st.st_mode)](fi)
      inoidx = fi.st.st_dev, fi.st.st_ino
      try:
        vino = me._inomap[inoidx]
      except KeyError:
        suffix = ''
        seq = 0
        while True:
          vino = '%08x' % (Z.crc32(fi.name + suffix) & 0xffffffff)
          if vino not in me._vinomap: break
          suffix = '\0%d' % seq
          seq += 1
        me._inomap[inoidx] = vino
    if h: info = h
    else: info = '[%-*s]' % (2*me._hsz - 2, fmt.info())
    print '%s %8s %6s %-12s %-20s %20s %s' % (
      info, vino, fmt.mode(), fmt.owner(),
      fmt.mtime(), fmt.size(), fmt.name())

###--------------------------------------------------------------------------
### Main program.

FMTMAP = {
  'rsync': lambda f: enum_rsync(stdin, f),
  'find0': lambda f: enum_find0(stdin, f)
}
op = OP.OptionParser(
  usage = '%prog [-a] [-c CACHE] [-f FORMAT] [-H HASH] [FILE ...]',
  version = '%%prog, version %s' % VERSION,
  description = '''\
Print a digest of a filesystem (or a collection of specified files) to
standard output.  The idea is that the digest should be mostly /complete/
(i.e., any `interesting\' change to the filesystem results in a different
digest) and /canonical/ (i.e., identical filesystem contents result in
identical output).
''')
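
## Typical invocations, for illustration only (the cache path and hash
## choice below are examples, not defaults):
##
##      fshash -H sha256 -c /var/cache/fshash.db /home/
##              walk everything under /home/, caching file hashes;
##
##      find /etc -print0 | fshash -f find0 -H sha256
##              hash exactly the files listed on stdin, NUL-separated.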

for short, long, props in [
  ('-a', '--all', { 'action': 'store_true', 'dest': 'all',
                    'help': 'clear cache of all files not seen' }),
  ('-c', '--cache', { 'dest': 'cache', 'metavar': 'FILE',
                      'help': 'use FILE as a cache for file hashes' }),
  ('-f', '--files', { 'dest': 'files', 'metavar': 'FORMAT',
                      'type': 'choice', 'choices': FMTMAP.keys(),
                      'help': 'read files to report in the given FORMAT' }),
  ('-H', '--hash', { 'dest': 'hash', 'metavar': 'HASH',
                     ##'type': 'choice', 'choices': H.algorithms,
                     'help': 'use HASH as the hash function' })]:
  op.add_option(short, long, **props)
opts, args = op.parse_args(argv)

if not opts.files and len(args) <= 1:
  die("no filename sources: nothing to do")
db = HashCache(opts.cache, opts.hash)
if opts.all:
  db.reset()
rep = Reporter(db)
if opts.files:
  FMTMAP[opts.files](rep.file)
for dir in args[1:]:
  enum_walk(dir, rep.file)
if opts.all:
  db.prune()
db.flush()

###----- That's all, folks --------------------------------------------------