fshash.in: Remove ambiguity for filenames containing ` -> '.
[rsync-backup] / fshash.in
CommitLineData
f6b4ffdc
MW
1#! @PYTHON@
2###
3### Efficiently construct canonical digests of filesystems
4###
5### (c) 2012 Mark Wooding
6###
7
8###----- Licensing notice ---------------------------------------------------
9###
10### This file is part of the `rsync-backup' program.
11###
12### rsync-backup is free software; you can redistribute it and/or modify
13### it under the terms of the GNU General Public License as published by
14### the Free Software Foundation; either version 2 of the License, or
15### (at your option) any later version.
16###
17### rsync-backup is distributed in the hope that it will be useful,
18### but WITHOUT ANY WARRANTY; without even the implied warranty of
19### MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20### GNU General Public License for more details.
21###
22### You should have received a copy of the GNU General Public License
23### along with rsync-backup; if not, write to the Free Software Foundation,
24### Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
25
26from sys import argv, exit, stdin, stdout, stderr
27import os as OS
28import re as RX
29import time as T
30import stat as ST
31import optparse as OP
32import hashlib as H
33import sqlite3 as DB
34import zlib as Z
35
36PACKAGE = '@PACKAGE@'
37VERSION = '@VERSION@'
38
39###--------------------------------------------------------------------------
40### Utilities.
41
## The name this program was invoked under, used to prefix diagnostics.
QUIS = OS.path.basename(argv[0])

def moan(msg):
  """Write the diagnostic MSG to standard error, prefixed by the program name."""
  line = '%s: %s\n' % (QUIS, msg)
  stderr.write(line)
46
def die(msg, rc = 1):
  """Report MSG as a fatal error, then exit the program with status RC."""
  moan(msg)
  ## `sys.exit' works by raising `SystemExit'; do so directly.
  raise SystemExit(rc)
50
## Running count of errors reported through `syserr'.
SYSERR = 0
def syserr(msg):
  """Report MSG on standard error and count it in the global `SYSERR'."""
  global SYSERR
  moan(msg)
  SYSERR = SYSERR + 1
56
57###--------------------------------------------------------------------------
58### File system enumeration.
59
class FileInfo (object):
  """
  A filename together with its `lstat' result or the error from fetching it.

  Attributes: `name' is the filename as given; `st' is the stat structure, or
  `None' if it couldn't be read; `err' is the `OSError' from a failed
  `lstat', or `None'.
  """

  def __init__(me, file, st = None):
    """
    Record FILE, using the stat structure ST if the caller supplied one.

    Otherwise `lstat' FILE; a failure is not raised, but stored in `err'.
    """
    me.name = file
    me.err = None
    me.st = st
    if not st:
      try:
        me.st = OS.lstat(file)
      except OSError as err:
        me.st = None
        me.err = err
73
def enum_walk(file, func):
  """
  Walk the filesystem starting at FILE, calling FUNC on each `FileInfo'.

  If FILE ends with `/' then change directory into it and report names
  relative to it, beginning with `.'.  Otherwise report FILE itself and, if
  it is a directory, its contents with FILE as a path prefix.

  Within a directory, everything which isn't a directory (including names
  which couldn't be `lstat'ed) is reported first, in name order; then each
  subdirectory is reported, followed immediately by its own contents.
  Subdirectories on a different device from the starting point are neither
  reported nor entered.
  """

  def dirents(name):
    ## Return the names in directory NAME.  A failure is reported through
    ## `syserr', and the directory is then treated as being empty.
    try:
      names = OS.listdir(name)
    except OSError as err:
      syserr("failed to read directory `%s': %s" % (name, err.strerror))
      names = []
    return names

  def is_dir(fi):
    return fi.st and ST.S_ISDIR(fi.st.st_mode)

  def walk(paths, dev):
    ## Stat everything up front, then separate directories from the rest.
    infos = [FileInfo(p) for p in paths]
    plain = sorted((fi for fi in infos if not is_dir(fi)),
                   key = lambda fi: fi.name)
    ## Directories sort as if their names already had the trailing `/'.
    subdirs = sorted((fi for fi in infos if is_dir(fi)),
                     key = lambda fi: fi.name + '/')
    for fi in plain:
      func(fi)
    for sub in subdirs:
      ## Don't cross onto other filesystems.
      if sub.st.st_dev != dev:
        continue
      func(sub)
      walk([OS.path.join(sub.name, e) for e in dirents(sub.name)], dev)

  if file.endswith('/'):
    OS.chdir(file)
    top = FileInfo('.')
    func(top)
    walk(dirents('.'), top.st.st_dev)
  else:
    top = FileInfo(file)
    func(top)
    if top.st and ST.S_ISDIR(top.st.st_mode):
      walk([OS.path.join(top.name, e) for e in dirents(top.name)],
           top.st.st_dev)
111
def enum_find0(f, func):
  """
  Read null-terminated filenames from the file F, as written by `find -print0'.

  FUNC is called with a `FileInfo' for each name, in order.  Anything left
  after the final null terminator is ignored, with a warning.
  """
  pending = ""
  while True:
    chunk = f.read(8192)
    if not chunk:
      break
    ## Everything before the last null is complete; the remainder may be
    ## continued by the next chunk, so hold it back.
    pieces = (pending + chunk).split('\0')
    pending = pieces.pop()
    for name in pieces:
      func(FileInfo(name))
  if pending:
    moan("ignored trailing junk after last filename")
125
## Matches an rsync escape sequence `\#OOO', capturing the three octal digits.
RX_RSYNCESC = RX.compile(r'\\ \# ([0-7]{3})', RX.VERBOSE)
def enum_rsync(f, func):
  """
  Read an rsync file listing from F and call FUNC on a `FileInfo' per entry.

  Lines which can't be parsed are reported through `syserr' and skipped.
  """

  ## The format is a little fiddly.  Each line consists of PERMS SIZE DATE
  ## TIME NAME, separated by runs of whitespace, but the NAME starts exactly
  ## one space character after the TIME and may begin with a space.
  ## Sequences of the form `\#OOO', where OOO are three octal digits, stand
  ## for a byte with that value.  Newlines and backslashes which would be
  ## ambiguous are converted into this form; all other characters are
  ## literal.
  ##
  ## We ignore the stat information and retrieve it ourselves, because it's
  ## incomplete.  Hopefully the dcache is still warm.

  def unescape(m):
    return chr(int(m.group(1), 8))

  for line in f:
    if line.endswith('\n'): line = line[:-1]

    ## Split off PERMS, SIZE and DATE; the fourth piece is `TIME NAME'.
    fields = line.split(None, 3)
    if len(fields) != 4:
      syserr("ignoring invalid line from rsync: `%s'" % line)
      continue

    ## The escaped name is everything after the first space in that piece.
    _, sep, escaped = fields[3].partition(' ')
    if not sep:
      syserr("ignoring invalid line from rsync: `%s'" % line)
      continue

    ## Now translate escape sequences.
    name = RX_RSYNCESC.sub(unescape, escaped)

    ## Call the client.
    try:
      fi = FileInfo(name)
    except OSError as err:
      syserr("failed to stat `%s': %s" % (name, err.strerror))
      continue
    func(fi)
166
167###--------------------------------------------------------------------------
168### The hash cache.
169
class HashCache (object):
  """
  Computes file content hashes, optionally remembering them in a database.

  The cache is an SQLite database with a single-row `meta' table, holding
  the schema version and the name of the hash function, and a `hash' table
  with one row per file.  Rows are keyed by inode number alone, and a row is
  considered current if its recorded mtime and size match the file's.

  The public attribute `hash' is the name of the hash function in use.
  """

  ## The cache schema version which this code understands.
  VERSION = 0

  ## How much of a file to read at a time when hashing.
  BUFSZ = 128*1024

  ## Statements to set up an empty cache database.
  INIT = [
    """CREATE TABLE meta (
               version INTEGER NOT NULL,
               hash TEXT NOT NULL
       );""",
    """CREATE TABLE hash (
               ino INTEGER PRIMARY KEY,
               mtime INTEGER NOT NULL,
               ctime INTEGER NOT NULL,
               size INTEGER NOT NULL,
               hash TEXT NOT NULL,
               seen BOOLEAN NOT NULL DEFAULT TRUE
       );""",
    """PRAGMA journal_mode = WAL;"""
  ]

  @staticmethod
  def _check_hash(hash):
    ## Die unless HASH names a hash function which `hashlib' can provide.
    try:
      H.new(hash)
    except (ValueError, TypeError):
      die("unknown hash function `%s'" % hash)

  def __init__(me, file, hash = None):
    """
    Open the cache database FILE, to use the hash function named HASH.

    If FILE is `None' then no cache is used, and HASH must be given.  If HASH
    is `None' then the function recorded in the cache is used.  A cache which
    is missing, unreadable, or of the wrong version is cleared and set up
    afresh.  The program exits (via `die') if no usable hash function can be
    determined, or if HASH disagrees with an existing cache.
    """

    if file is None:

      ## We're going this alone, with no cache.
      db = None
      if hash is None:
        die("no hash specified and no database cache to read from")
      me._check_hash(hash)
    else:

      ## Connect to the database.
      db = DB.connect(file)
      db.text_factory = str

      ## See whether we can understand the cache database.  If the table is
      ## empty then unpacking the `None' result raises `TypeError'.
      c = db.cursor()
      v = h = None
      try:
        c.execute('SELECT version, hash FROM meta')
        v, h = c.fetchone()
        if c.fetchone() is not None:
          die("cache database corrupt: meta table has multiple rows")
      except (DB.Error, TypeError):
        pass

      ## If that didn't work, we'd better clear the thing and start again.
      ## But only if we know how to initialize it.
      if v != me.VERSION:

        ## Explain the situation.
        moan("cache version %s not understood" % v)
        if hash is None:
          if h is None:
            die("can't initialize cache: no hash function set")
          else:
            hash = h
        me._check_hash(hash)

        ## Drop old things.
        c.execute('SELECT type, name FROM sqlite_master')
        for type, name in c.fetchall():
          c.execute('DROP %s IF EXISTS %s' % (type, name))

        ## Now we're ready to go.
        for stmt in me.INIT:
          c.execute(stmt)
        c.execute('INSERT INTO meta VALUES (?, ?)', [me.VERSION, hash])
        db.commit()

        ## The cache now uses the hash we just recorded, whatever the old
        ## contents said, so don't complain about a mismatch below.
        h = hash

      ## Check the hash function if necessary.
      if hash is None:
        hash = h
      elif h is not None and h != hash:
        die("hash mismatch: cache uses %s but %s requested" % (h, hash))

    ## All done.
    me.hash = hash
    me._db = db
    me._pend = 0

  def hashfile(me, fi):
    """
    Return the hex-encoded hash of the file described by the `FileInfo' FI.

    Return `None' if FI carries an error or isn't a regular file.  If the
    file can't be read, set `fi.err' to the exception, clear `fi.st', and
    return `None'.  When the file is actually read, `fi.st' is replaced by
    the result of `fstat' on the open descriptor.
    """

    ## If this isn't a proper file then don't try to hash it.
    if fi.err or not ST.S_ISREG(fi.st.st_mode):
      return None

    ## See whether there's a valid entry in the cache.
    c = None
    if me._db:
      c = me._db.cursor()
      c.execute(
        'SELECT mtime, size, hash, seen FROM hash WHERE ino = ?;',
        [fi.st.st_ino])
      r = c.fetchone()
      if r is not None:
        mt, sz, h, s = r
        if mt == fi.st.st_mtime and sz == fi.st.st_size:
          if not s:
            c.execute('UPDATE hash SET seen = 1 WHERE ino = ?',
                      [fi.st.st_ino])
            me._update()
          return h

    ## Hash the file.  Beware raciness: update the file information from the
    ## open descriptor, so that what we record describes the file we read.
    h = H.new(me.hash)
    try:
      with open(fi.name, 'rb') as f:
        while True:
          buf = f.read(me.BUFSZ)
          if not buf:
            break
          h.update(buf)
        fi.st = OS.fstat(f.fileno())
    except (OSError, IOError) as err:
      fi.st = None
      fi.err = err
      return None
    hash = h.hexdigest()

    ## Insert a record into the database.
    if me._db:
      c.execute("""
              INSERT OR REPLACE INTO hash
                      (ino, mtime, ctime, size, hash, seen)
              VALUES
                      (?, ?, ?, ?, ?, 1);
      """, [fi.st.st_ino,
            fi.st.st_mtime,
            fi.st.st_ctime,
            fi.st.st_size,
            hash])
      me._update()

    ## Done.
    return hash

  def _update(me):
    ## Note a pending database change; commit once 1024 have piled up.
    me._pend += 1
    if me._pend >= 1024:
      me.flush()

  def flush(me):
    """Commit any pending changes to the cache database, if there is one."""
    if me._db:
      me._db.commit()
    me._pend = 0

  def need_db(me):
    """Exit the program (via `die') unless a cache database is in use."""
    if not me._db:
      die("no cache database")

  def reset(me):
    """Clear the `seen' flag on every cache entry.  Requires a database."""
    me.need_db()
    c = me._db.cursor()
    c.execute('UPDATE hash SET seen = 0 WHERE seen')
    me.flush()

  def prune(me):
    """Delete every cache entry not marked `seen'.  Requires a database."""
    me.need_db()
    c = me._db.cursor()
    c.execute('DELETE FROM hash WHERE NOT seen')
    me.flush()
340
341###--------------------------------------------------------------------------
342### Printing output.
343
class GenericFormatter (object):
  """
  Base class for formatting the fields of a report line for one file.

  Wraps a `FileInfo' as `fi'.  Subclasses are expected to provide a `TYPE'
  attribute, which the default `info' method returns.
  """

  def __init__(me, fi):
    me.fi = fi

  def _fmt_time(me, t):
    ## Format the time T (seconds since the epoch) as ISO 8601 in UTC.
    return T.strftime('%Y-%m-%dT%H:%M:%SZ', T.gmtime(t))

  def _enc_name(me, n):
    ## Escape awkward characters in the name N.  Also mark any ` -> ' in
    ## the name itself, so that it can't be confused with the separator
    ## written between a symbolic link and its target.
    return n.encode('string_escape').replace(' -> ', ' \\-> ')

  def name(me):
    """Return the file's name, suitably escaped."""
    return me._enc_name(me.fi.name)

  def info(me):
    """Return the text to show in place of a hash: the file type."""
    return me.TYPE

  def mode(me):
    """Return the file mode, as six octal digits."""
    st = me.fi.st
    return '%06o' % st.st_mode

  def size(me):
    """Return the file size."""
    st = me.fi.st
    return st.st_size

  def mtime(me):
    """Return the modification time, formatted by `_fmt_time'."""
    st = me.fi.st
    return me._fmt_time(st.st_mtime)

  def owner(me):
    """Return the numeric owner and group, as `UID:GID'."""
    st = me.fi.st
    return '%5d:%d' % (st.st_uid, st.st_gid)
364
class ErrorFormatter (GenericFormatter):
  """Formatter for a file whose information couldn't be read."""

  def info(me):
    """Return the error number and message from the file's `err'."""
    err = me.fi.err
    return 'E%d %s' % (err.errno, err.strerror)

  def error(me):
    """Return the placeholder used for every stat-derived field."""
    return 'error'

  ## There's no stat structure to consult, so all of these are placeholders.
  mode = error
  size = error
  mtime = error
  owner = error
370
class SocketFormatter (GenericFormatter):
  """Formatter for sockets."""
  TYPE = 'socket'

class PipeFormatter (GenericFormatter):
  """Formatter for named pipes."""
  TYPE = 'fifo'
375
class LinkFormatter (GenericFormatter):
  """Formatter for symbolic links, which shows the link target too."""

  TYPE = 'symbolic-link'

  def name(me):
    """
    Return `NAME -> TARGET', both escaped.

    If the link can't be read, the target is replaced by the error number
    and message in angle brackets.
    """
    link = GenericFormatter.name(me)
    try:
      dest = OS.readlink(me.fi.name)
    except OSError as err:
      target = '<E%d %s>' % (err.errno, err.strerror)
    else:
      target = me._enc_name(dest)
    return '%s -> %s' % (link, target)
385
class DirectoryFormatter (GenericFormatter):
  """Formatter for directories."""

  TYPE = 'directory'

  def name(me):
    """Return the escaped name with a trailing `/'."""
    base = GenericFormatter.name(me)
    return base + '/'

  def size(me):
    """Return a fixed placeholder rather than the directory's size."""
    return 'dir'
390
class DeviceFormatter (GenericFormatter):
  """Common base for device-node formatters; subclasses set `TYPE'."""

  def info(me):
    """Return the device type followed by its `MAJOR:MINOR' numbers."""
    rdev = me.fi.st.st_rdev
    return '%s %d:%d' % (me.TYPE, OS.major(rdev), OS.minor(rdev))

class BlockDeviceFormatter (DeviceFormatter):
  """Formatter for block devices."""
  TYPE = 'block-device'

class CharDeviceFormatter (DeviceFormatter):
  """Formatter for character devices."""
  TYPE = 'character-device'
400
class FileFormatter (GenericFormatter):
  """Formatter for regular files."""
  TYPE = 'regular-file'
403
class Reporter (object):
  """
  Writes one line of the digest to standard output for each file reported.

  Real inode numbers are replaced by `virtual' ones derived from the name
  under which each inode is first seen, so that the output doesn't depend on
  where the filesystem happened to put things.  Further names for the same
  device and inode pair are given the same virtual number.
  """

  ## Map from file-type bits to the formatter class for that kind of file.
  TYMAP = {
    ST.S_IFSOCK: SocketFormatter,
    ST.S_IFDIR: DirectoryFormatter,
    ST.S_IFLNK: LinkFormatter,
    ST.S_IFREG: FileFormatter,
    ST.S_IFBLK: BlockDeviceFormatter,
    ST.S_IFCHR: CharDeviceFormatter,
    ST.S_IFIFO: PipeFormatter,
  }

  def __init__(me, db):
    """Prepare to report files, fetching content hashes through DB."""
    me._inomap = {}                     # (dev, ino) -> virtual inode number
    me._vinomap = {}                    # virtual inode number -> (dev, ino)
    me._db = db
    me._hsz = int(H.new(db.hash).digest_size)

  def file(me, fi):
    """
    Write the report line for the `FileInfo' FI.

    Raises `KeyError' if the file's type isn't one listed in `TYMAP'.
    """

    ## Hash first: a failure to read the file is recorded in `fi.err', and
    ## we want to notice that below.
    h = me._db.hashfile(fi)
    if fi.err:
      fmt = ErrorFormatter(fi)
      vino = 'error'
    else:
      fmt = me.TYMAP[ST.S_IFMT(fi.st.st_mode)](fi)
      inoidx = fi.st.st_dev, fi.st.st_ino
      try:
        vino = me._inomap[inoidx]
      except KeyError:

        ## A new inode.  Derive its virtual number from the name, retrying
        ## with a distinguishing suffix if that number is already taken.
        suffix = ''
        seq = 0
        while True:
          vino = '%08x' % (Z.crc32(fi.name + suffix) & 0xffffffff)
          if vino not in me._vinomap: break
          suffix = '\0%d' % seq
          seq += 1
        me._inomap[inoidx] = vino

        ## Remember that this number is in use; otherwise the check above
        ## can never fire, and distinct files whose names' checksums collide
        ## would be reported as if they were links to the same inode.
        me._vinomap[vino] = inoidx

    ## Where there's no hash, show the formatter's information instead,
    ## bracketed and padded to the width of a hex-encoded hash.
    if h: info = h
    else: info = '[%-*s]' % (2*me._hsz - 2, fmt.info())
    stdout.write('%s %8s %6s %-12s %-20s %20s %s\n' % (
      info, vino, fmt.mode(), fmt.owner(),
      fmt.mtime(), fmt.size(), fmt.name()))
446
447###--------------------------------------------------------------------------
448### Main program.
449
## Map from the FORMAT names accepted by `-f' to functions which read a list
## of filenames in that format from standard input, calling F on each.
FMTMAP = {
  'rsync': lambda f: enum_rsync(stdin, f),
  'find0': lambda f: enum_find0(stdin, f)
}

## Parse the command line.
op = OP.OptionParser(
  usage = '%prog [-a] [-c CACHE] [-f FORMAT] [-H HASH] [FILE ...]',
  version = '%%prog, version %s' % VERSION,
  description = '''\
Print a digest of a filesystem (or a collection of specified files) to
standard output.  The idea is that the digest should be mostly /complete/
(i.e., any `interesting\' change to the filesystem results in a different
digest) and /canonical/ (i.e., identical filesystem contents result in
identical output).
''')
op.add_option('-a', '--all', action = 'store_true', dest = 'all',
              help = 'clear cache of all files not seen')
op.add_option('-c', '--cache', dest = 'cache', metavar = 'FILE',
              help = 'use FILE as a cache for file hashes')
op.add_option('-f', '--files', dest = 'files', metavar = 'FORMAT',
              type = 'choice', choices = FMTMAP.keys(),
              help = 'read files to report in the given FORMAT')
op.add_option('-H', '--hash', dest = 'hash', metavar = 'HASH',
              ##type = 'choice', choices = H.algorithms,
              help = 'use HASH as the hash function')

## The whole of `argv' is parsed, so `args[0]' is the program name and the
## files to report start at `args[1]'.
opts, args = op.parse_args(argv)
if not opts.files and len(args) <= 1:
  die("no filename sources: nothing to do")

## Set up the hash cache.  With `-a', first forget which files have been
## seen, so that the ones which don't turn up this time can be pruned.
db = HashCache(opts.cache, opts.hash)
if opts.all:
  db.reset()

## Report the files listed on standard input, and then those found by
## walking the filesystem from each command-line argument.
##
## NOTE(review): `SYSERR' is counted but doesn't appear to influence the
## exit status anywhere in this part of the file -- confirm that's intended.
rep = Reporter(db)
if opts.files:
  FMTMAP[opts.files](rep.file)
for path in args[1:]:
  enum_walk(path, rep.file)

## Tidy up the cache.
if opts.all:
  db.prune()
db.flush()
492
493###----- That's all, folks --------------------------------------------------