### Efficiently construct canonical digests of filesystems
###
### (c) 2012 Mark Wooding
###

###----- Licensing notice ---------------------------------------------------
###
### This file is part of the `rsync-backup' program.
###
### rsync-backup is free software; you can redistribute it and/or modify
### it under the terms of the GNU General Public License as published by
### the Free Software Foundation; either version 2 of the License, or
### (at your option) any later version.
###
### rsync-backup is distributed in the hope that it will be useful,
### but WITHOUT ANY WARRANTY; without even the implied warranty of
### MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
### GNU General Public License for more details.
###
### You should have received a copy of the GNU General Public License
### along with rsync-backup; if not, write to the Free Software Foundation,
### Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
## Command-line arguments, process exit, and the three standard streams.
from sys import argv, exit, stdin, stdout, stderr
## Name of the package this script belongs to, and its version string.
## VERSION is interpolated into the option parser's `version' text.
PACKAGE = 'rsync-backup'
VERSION = '0.99.1-8-ga844'
39 ###--------------------------------------------------------------------------
## The name this program was invoked as, with any leading directory
## components removed.  It prefixes the diagnostics written to stderr.
QUIS = OS.path.basename(argv[0])
45 stderr
.write('%s: %s\n' %
(QUIS
, msg
))
57 ###--------------------------------------------------------------------------
58 ### File system enumeration.
60 class FileInfo (object):
61 def __init__(me
, file, st
= None):
68 me
.st
= OS
.lstat(file)
74 def enum_walk(file, func
):
78 return OS
.listdir(name
)
80 syserr("failed to read directory `%s': %s" %
(name
, err
.strerror
))
88 if fi
.st
and fi
.st
.st_dev
!= dev
: pass
89 if fi
.st
and ST
.S_ISDIR(fi
.st
.st_mode
): dd
.append(fi
)
91 ff
.sort(key
= lambda fi
: fi
.name
)
92 dd
.sort(key
= lambda fi
: fi
.name
+ '/')
96 if d
.st
.st_dev
== dev
:
98 dir([OS
.path
.join(d
.name
, e
) for e
in dirents(d
.name
)], dev
)
100 if file.endswith('/'):
101 cwd
= OS
.open('.', OS
.O_RDONLY
)
106 dir(dirents('.'), fi
.st
.st_dev
)
113 if fi
.st
and ST
.S_ISDIR(fi
.st
.st_mode
):
114 dir([OS
.path
.join(fi
.name
, e
) for e
in dirents(fi
.name
)],
117 def enum_find0(f
, func
):
122 names
= (tail
+ buf
).split('\0')
129 moan("ignored trailing junk after last filename")
## Matches one rsync escape sequence: a backslash, a `#', and exactly three
## octal digits (captured as group 1).  The pattern is compiled in verbose
## mode, so the spaces inside it are layout only and match nothing.
RX_RSYNCESC = RX.compile(r'\\ \# ([0-7]{3})', RX.VERBOSE)
132 def enum_rsync(f
, func
):
## The format is a little fiddly.  Each line consists of PERMS SIZE DATE
## TIME NAME, separated by runs of whitespace, but the NAME starts exactly
## one space character after the TIME and may begin with a space.
## Sequences of the form `\#OOO', where OOO are three octal digits, stand
## for a byte with that value.  Newlines and backslashes which would be
## ambiguous are converted into this form; all other characters are
## literal.
##
## We ignore the stat information and retrieve it ourselves, because it's
## incomplete.  Hopefully the dcache is still warm.
146 if line
.endswith('\n'): line
= line
[:-1]
148 ## Extract the escaped name.
149 ff
= line
.split(None, 3)
151 syserr("ignoring invalid line from rsync: `%s'" % line
)
155 spc
= tail
.index(' ')
157 syserr("ignoring invalid line from rsync: `%s'" % line
)
159 name
= tail
[spc
+ 1:]
161 ## Now translate escape sequences.
162 name
= RX_RSYNCESC
.sub(lambda m
: chr(int(m
.group(1), 8)), name
)
168 syserr("failed to stat `%s': %s" %
(name
, err
.strerror
))
172 ###--------------------------------------------------------------------------
175 class HashCache (object):
181 """CREATE TABLE meta (
182 version INTEGER NOT NULL,
185 """CREATE TABLE hash (
186 ino INTEGER PRIMARY KEY,
187 mtime INTEGER NOT NULL,
188 ctime INTEGER NOT NULL,
189 size INTEGER NOT NULL,
191 seen BOOLEAN NOT NULL DEFAULT TRUE
193 """PRAGMA journal_mode = WAL;"""
196 def __init__(me
, file, hash = None):
200 ## We're going this alone, with no cache.
203 die("no hash specified and no database cache to read from")
206 ## Connect to the database.
207 db
= DB
.connect(file)
208 db
.text_factory
= str
210 ## See whether we can understand the cache database.
214 c
.execute('SELECT version, hash FROM meta')
216 if c
.fetchone() is not None:
217 die("cache database corrupt: meta table has mutliple rows")
218 except (DB
.Error
, TypeError):
221 ## If that didn't work, we'd better clear the thing and start again.
222 ## But only if we know how to initialize it.
225 ## Explain the situation.
226 moan("cache version %s not understood" % v
)
229 die("can't initialize cache: no hash function set")
235 die("unknown hash function `%s'" %
hash)
238 c
.execute('SELECT type, name FROM sqlite_master')
239 for type, name
in c
.fetchall():
240 c
.execute('DROP %s IF EXISTS %s' %
(type, name
))
242 ## Now we're ready to go.
245 c
.execute('INSERT INTO meta VALUES (?, ?)', [me
.VERSION
, hash])
248 ## Check the hash function if necessary.
251 elif h
is not None and h
!= hash:
252 die("hash mismatch: cache uses %s but %s requested" %
(h
, hash))
259 def hashfile(me
, fi
):
261 ## If this isn't a proper file then don't try to hash it.
262 if fi
.err
or not ST
.S_ISREG(fi
.st
.st_mode
):
265 ## See whether there's a valid entry in the cache.
269 'SELECT mtime, size, hash, seen FROM hash WHERE ino = ?;',
274 if mt
== fi
.st
.st_mtime
and \
277 c
.execute('UPDATE hash SET seen = 1 WHERE ino = ?',
282 ## Hash the file. Beware raciness: update the file information from the
283 ## open descriptor, but set the size from what we actually read.
286 with
open(fi
.name
, 'rb') as f
:
289 buf
= f
.read(me
.BUFSZ
)
294 fi
.st
= OS
.fstat(f
.fileno())
297 except (OSError, IOError), err
:
301 hash = hash.encode('hex')
303 ## Insert a record into the database.
306 INSERT OR REPLACE INTO hash
307 (ino, mtime, ctime, size, hash, seen)
332 die("no cache database")
337 c
.execute('UPDATE hash SET seen = 0 WHERE seen')
343 c
.execute('DELETE FROM hash WHERE NOT seen')
346 ###--------------------------------------------------------------------------
349 class GenericFormatter (object):
350 def __init__(me
, fi
):
352 def _fmt_time(me
, t
):
354 return T
.strftime('%Y-%m-%dT%H:%M:%SZ', tm
)
355 def _enc_name(me
, n
):
356 return ' \\-> '.join(n
.encode('string_escape').split(' -> '))
358 return me
._enc_name(me
.fi
.name
)
362 return '%06o' % me
.fi
.st
.st_mode
364 return me
.fi
.st
.st_size
366 return me
._fmt_time(me
.fi
.st
.st_mtime
)
368 return '%5d:%d' %
(me
.fi
.st
.st_uid
, me
.fi
.st
.st_gid
)
370 class ErrorFormatter (GenericFormatter
):
372 return 'E%d %s' %
(me
.fi
.err
.errno
, me
.fi
.err
.strerror
)
def error(me):
    """Return the fixed placeholder string 'error'."""
    return 'error'
## The mode, size, mtime and owner accessors are all aliases for `error',
## so each of those fields is reported as the same placeholder.
mode = size = mtime = owner = error
376 class SocketFormatter (GenericFormatter
):
378 class PipeFormatter (GenericFormatter
):
381 class LinkFormatter (GenericFormatter
):
382 TYPE
= 'symbolic-link'
384 n
= GenericFormatter
.name(me
)
386 d
= OS
.readlink(me
.fi
.name
)
387 return '%s -> %s' %
(n
, me
._enc_name(d
))
389 return '%s -> <E%d %s>' %
(n
, err
.errno
, err
.strerror
)
391 class DirectoryFormatter (GenericFormatter
):
def name(me):
    """Return the name as GenericFormatter renders it, with `/' appended."""
    return GenericFormatter.name(me) + '/'
def size(me):
    """Return the fixed string 'dir' rather than a numeric size."""
    return 'dir'
396 class DeviceFormatter (GenericFormatter
):
398 return '%s %d:%d' %
(me
.TYPE
,
399 OS
.major(me
.fi
.st
.st_rdev
),
400 OS
.minor(me
.fi
.st
.st_rdev
))
class BlockDeviceFormatter (DeviceFormatter):
    ## Formatter selected for block devices; TYPE is the label that the
    ## DeviceFormatter output puts before the major:minor device numbers.
    TYPE = 'block-device'
class CharDeviceFormatter (DeviceFormatter):
    ## Formatter selected for character devices; TYPE is the label that the
    ## DeviceFormatter output puts before the major:minor device numbers.
    TYPE = 'character-device'
class FileFormatter (GenericFormatter):
    ## Formatter selected for regular files; everything except the type
    ## label is inherited from GenericFormatter.
    TYPE = 'regular-file'
409 class Reporter (object):
412 ST
.S_IFSOCK
: SocketFormatter
,
413 ST
.S_IFDIR
: DirectoryFormatter
,
414 ST
.S_IFLNK
: LinkFormatter
,
415 ST
.S_IFREG
: FileFormatter
,
416 ST
.S_IFBLK
: BlockDeviceFormatter
,
417 ST
.S_IFCHR
: CharDeviceFormatter
,
418 ST
.S_IFIFO
: PipeFormatter
,
421 def __init__(me
, db
):
425 me
._hsz
= int(H
.new(db
.hash).digest_size
)
428 h
= me
._db
.hashfile(fi
)
430 fmt
= ErrorFormatter(fi
)
433 fmt
= me
.TYMAP
[ST
.S_IFMT(fi
.st
.st_mode
)](fi
)
434 inoidx
= fi
.st
.st_dev
, fi
.st
.st_ino
436 vino
= me
._inomap
[inoidx
]
441 vino
= '%08x' %
(Z
.crc32(fi
.name
+ suffix
) & 0xffffffff)
442 if vino
not in me
._vinomap
: break
443 suffix
= '\0%d' % seq
445 me
._inomap
[inoidx
] = vino
447 else: info
= '[%-*s]' %
(2*me
._hsz
- 2, fmt
.info())
448 print '%s %8s %6s %-12s %-20s %20s %s' %
(
449 info
, vino
, fmt
.mode(), fmt
.owner(),
450 fmt
.mtime(), fmt
.size(), fmt
.name())
452 ###--------------------------------------------------------------------------
456 'rsync': lambda f
: enum_rsync(stdin
, f
),
457 'find0': lambda f
: enum_find0(stdin
, f
)
459 op
= OP
.OptionParser(
460 usage
= '%prog [-a] [-c CACHE] [-f FORMAT] [-H HASH] [FILE ...]',
461 version
= '%%prog, version %s' % VERSION
,
463 Print a digest of a filesystem (or a collection of specified files) to
464 standard output. The idea is that the digest should be mostly /complete/
465 (i.e., any `interesting\' change to the filesystem results in a different
466 digest) and /canonical/ (i.e., identical filesystem contents result in
## Register the command-line options.  Each entry gives the short flag, the
## long flag, and the keyword arguments to pass to `add_option'.
for short, long, props in [
    ('-a', '--all', { 'action': 'store_true', 'dest': 'all',
                      'help': 'clear cache of all files not seen' }),
    ('-c', '--cache', { 'dest': 'cache', 'metavar': 'FILE',
                        'help': 'use FILE as a cache for file hashes' }),
    ('-f', '--files', { 'dest': 'files', 'metavar': 'FORMAT',
                        'type': 'choice', 'choices': FMTMAP.keys(),
                        'help': 'read files to report in the given FORMAT' }),
    ('-H', '--hash', { 'dest': 'hash', 'metavar': 'HASH',
                       ##'type': 'choice', 'choices': H.algorithms,
                       'help': 'use HASH as the hash function' })]:
    op.add_option(short, long, **props)

## The whole of `argv' is parsed, so the program name is left as args[0];
## that is why `len(args) <= 1' below means no FILE arguments were given.
opts, args = op.parse_args(argv)

## There must be something to report on: either a file list read in one of
## the FMTMAP formats, or at least one FILE on the command line.
if not opts.files and len(args) <= 1:
    die("no filename sources: nothing to do")

## Open the hash cache; both the cache file and the hash may be unset.
db = HashCache(opts.cache, opts.hash)
491 FMTMAP
[opts
.files
](rep
.file)
493 enum_walk(dir, rep
.file)
498 ###----- That's all, folks --------------------------------------------------