[rsync-backup] / fshash.in

#! @PYTHON@
###
### Efficiently construct canonical digests of filesystems
###
### (c) 2012 Mark Wooding
###

###----- Licensing notice ---------------------------------------------------
###
### This file is part of the `rsync-backup' program.
###
### rsync-backup is free software; you can redistribute it and/or modify
### it under the terms of the GNU General Public License as published by
### the Free Software Foundation; either version 2 of the License, or
### (at your option) any later version.
###
### rsync-backup is distributed in the hope that it will be useful,
### but WITHOUT ANY WARRANTY; without even the implied warranty of
### MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
### GNU General Public License for more details.
###
### You should have received a copy of the GNU General Public License
### along with rsync-backup; if not, write to the Free Software Foundation,
### Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.

from sys import argv, exit, stdin, stdout, stderr
import os as OS
import re as RX
import time as T
import stat as ST
import optparse as OP
import hashlib as H
import sqlite3 as DB
import zlib as Z

PACKAGE = '@PACKAGE@'
VERSION = '@VERSION@'

###--------------------------------------------------------------------------
### Utilities.

## The name we were invoked under, used to prefix diagnostics.
QUIS = OS.path.split(argv[0])[1]

def moan(msg):
  """Write MSG to standard error as `PROGRAM: MSG', followed by a newline."""
  line = '%s: %s\n' % (QUIS, msg)
  stderr.write(line)

def die(msg, rc = 1):
  """Report MSG as a fatal error via `moan' and exit with status RC."""
  moan(msg)
  ## `sys.exit' is just a wrapper around raising this exception.
  raise SystemExit(rc)

## Running count of the system errors reported through `syserr'.
SYSERR = 0

def syserr(msg):
  """Report MSG via `moan' and bump the global SYSERR counter."""
  global SYSERR
  moan(msg)
  SYSERR = SYSERR + 1

###--------------------------------------------------------------------------
### File system enumeration.

class FileInfo (object):
  """
  A filename together with the result of trying to `lstat' it.

  Attributes:
    name -- the filename, exactly as given
    st -- the stat result, or `None' if the `lstat' failed
    err -- the `OSError' raised by `lstat', or `None' on success
  """

  def __init__(me, file, st = None):
    """
    Record FILE and its stat information.

    If ST is given (and true) it is used as-is; otherwise FILE is
    `lstat'ed.  A failing `lstat' is not propagated: the exception is
    stored in `err' so that callers can report it alongside the name.
    """
    me.name = file
    if st:
      me.st = st
      me.err = None
    else:
      try:
        me.st = OS.lstat(file)
        me.err = None
      except OSError as err:
        me.st = None
        me.err = err

def enum_walk(file, func):
  """
  Walk the filesystem tree rooted at FILE, calling FUNC on each object.

  FUNC is called with a `FileInfo' for FILE itself and then, if FILE is a
  directory, for everything beneath it.  Within each directory the
  non-directories are reported first, in name order, followed by each
  subdirectory and its contents.  Subdirectories which live on a different
  device from the root are neither reported nor descended into.

  If FILE ends with a `/' then we change directory into it first, so the
  names reported are relative to it (starting with `.').  Failure to enter
  the directory is fatal.  Directories which can't be read are reported via
  `syserr' and treated as empty.
  """

  def dirents(name):
    ## Return the names in directory NAME, or nothing if it's unreadable.
    try:
      return OS.listdir(name)
    except OSError as err:
      syserr("failed to read directory `%s': %s" % (name, err.strerror))
      return []

  def walk(ee, dev):
    ## Report the objects named in EE, recursing into directories on DEV.
    ff = []
    dd = []
    for e in ee:
      fi = FileInfo(e)
      if fi.st and ST.S_ISDIR(fi.st.st_mode): dd.append(fi)
      else: ff.append(fi)

    ## Sort directories as if their names had the trailing `/' they're
    ## printed with, so that the output order is canonical.
    ff.sort(key = lambda fi: fi.name)
    dd.sort(key = lambda fi: fi.name + '/')
    for f in ff:
      func(f)
    for d in dd:
      if d.st.st_dev == dev:
        func(d)
        walk([OS.path.join(d.name, e) for e in dirents(d.name)], dev)

  if file.endswith('/'):
    try:
      OS.chdir(file)
    except OSError as err:
      die("failed to enter directory `%s': %s" % (file, err.strerror))
    fi = FileInfo('.')
    func(fi)

    ## If we couldn't stat the directory then FUNC has already been told
    ## about the error, and there's no device number to walk with.
    if fi.st:
      walk(dirents('.'), fi.st.st_dev)
  else:
    fi = FileInfo(file)
    func(fi)
    if fi.st and ST.S_ISDIR(fi.st.st_mode):
      walk([OS.path.join(fi.name, e) for e in dirents(fi.name)],
           fi.st.st_dev)

def enum_find0(f, func):
  """
  Read null-terminated filenames from the file F and report each to FUNC.

  FUNC is called with a `FileInfo' for each complete name.  Anything left
  over after the final null byte is ignored, with a warning.
  """
  pending = ""
  while True:
    chunk = f.read(8192)

    ## The last piece has no terminator yet: hold it back until we've seen
    ## more input.
    pieces = (pending + chunk).split('\0')
    pending = pieces.pop()
    for name in pieces:
      func(FileInfo(name))
    if len(chunk) == 0:
      break
  if pending:
    moan("ignored trailing junk after last filename")

RX_RSYNCESC = RX.compile(r'\\ \# ([0-7]{3})', RX.VERBOSE)
def enum_rsync(f, func):
  """
  Read an rsync file listing from F and report each file named to FUNC.

  FUNC is called with a `FileInfo' for each name found.  Lines which can't
  be parsed are reported via `syserr' and skipped.  Files which can't be
  statted are still passed to FUNC: the `FileInfo' carries the error.
  """

  ## The format is a little fiddly.  Each line consists of PERMS SIZE DATE
  ## TIME NAME, separated by runs of whitespace, but the NAME starts exactly
  ## one space character after the TIME and may begin with a space.
  ## Sequences of the form `\#OOO' where OOO are three octal digits, stand
  ## for a byte with that value.  Newlines and backslashes which would be
  ## ambiguous are converted into this form; all other characters are
  ## literal.
  ##
  ## We ignore the stat information and retrieve it ourselves, because it's
  ## incomplete.  Hopefully the dcache is still warm.

  for line in f:
    if line.endswith('\n'): line = line[:-1]

    ## Extract the escaped name.  The fourth field is `TIME NAME'; split it
    ## at the first space only, so as to preserve leading spaces in NAME.
    ff = line.split(None, 3)
    if len(ff) != 4:
      syserr("ignoring invalid line from rsync: `%s'" % line)
      continue
    _, sep, name = ff[3].partition(' ')
    if not sep:
      syserr("ignoring invalid line from rsync: `%s'" % line)
      continue

    ## Now translate escape sequences.
    name = RX_RSYNCESC.sub(lambda m: chr(int(m.group(1), 8)), name)

    ## Call the client.  `FileInfo' captures any `lstat' failure itself, so
    ## there's nothing to catch here.
    func(FileInfo(name))

###--------------------------------------------------------------------------
### The hash cache.

class HashCache (object):
  """
  Compute content hashes of files, optionally caching them in SQLite.

  The cache maps an inode number to the file's mtime, ctime, size and hash.
  A cached hash is reused if the file's mtime and size still match.  Each
  row also carries a `seen' flag so that entries for files which weren't
  visited during a run can be pruned afterwards (see `reset' and `prune').

  Attributes:
    hash -- name of the hash function in use
  """

  ## Schema version we understand, and the read buffer size for hashing.
  VERSION = 0
  BUFSZ = 128*1024

  ## Statements to initialize an empty database.
  INIT = [
    """CREATE TABLE meta (
               version INTEGER NOT NULL,
               hash TEXT NOT NULL
       );""",
    """CREATE TABLE hash (
               ino INTEGER PRIMARY KEY,
               mtime INTEGER NOT NULL,
               ctime INTEGER NOT NULL,
               size INTEGER NOT NULL,
               hash TEXT NOT NULL,
               seen BOOLEAN NOT NULL DEFAULT TRUE
       );""",
    """PRAGMA journal_mode = WAL;"""
  ]

  def __init__(me, file, hash = None):
    """
    Open the cache database FILE, hashing with the function named HASH.

    If FILE is `None' then no cache is used and HASH must be given.  If
    HASH is `None' then the hash function recorded in the cache is used.
    A cache whose version we don't understand is wiped and rebuilt; a cache
    of the right version but a different hash function is a fatal error.
    """

    if file is None:

      ## We're going this alone, with no cache.
      db = None
      if hash is None:
        die("no hash specified and no database cache to read from")
    else:

      ## Connect to the database.
      db = DB.connect(file)
      db.text_factory = str

      ## See whether we can understand the cache database.  A missing table
      ## raises `DB.Error', and an empty one makes unpacking `None' raise
      ## `TypeError'; either way V and H stay unset and we rebuild below.
      c = db.cursor()
      v = h = None
      try:
        c.execute('SELECT version, hash FROM meta')
        v, h = c.fetchone()
        if c.fetchone() is not None:
          die("cache database corrupt: meta table has multiple rows")
      except (DB.Error, TypeError):
        pass

      ## If that didn't work, we'd better clear the thing and start again.
      ## But only if we know how to initialize it.
      if v != me.VERSION:

        ## Explain the situation.
        moan("cache version %s not understood" % v)
        if hash is None:
          if h is None:
            die("can't initialize cache: no hash function set")
          else:
            hash = h
        try:
          H.new(hash)
        except Exception:
          die("unknown hash function `%s'" % hash)

        ## Drop old things.
        c.execute('SELECT type, name FROM sqlite_master')
        for type, name in c.fetchall():
          c.execute('DROP %s IF EXISTS %s' % (type, name))

        ## Now we're ready to go.
        for stmt in me.INIT:
          c.execute(stmt)
        c.execute('INSERT INTO meta VALUES (?, ?)', [me.VERSION, hash])
        db.commit()

        ## The cache now uses the hash function we just wrote.  Forget the
        ## one from the old metadata, or the check below would reject the
        ## database we've only just rebuilt.
        h = hash

      ## Check the hash function if necessary.
      if hash is None:
        hash = h
      elif h is not None and h != hash:
        die("hash mismatch: cache uses %s but %s requested" % (h, hash))

    ## All done.
    me.hash = hash
    me._db = db
    me._pend = 0

  def hashfile(me, fi):
    """
    Return the hash of the file described by FI, as a hex string.

    Returns `None' if FI isn't a regular file, already carries an error, or
    can't be read; in the last case FI's `st' is cleared and its `err' set.
    On a cache miss, FI's `st' is refreshed from the open file descriptor.
    """

    ## If this isn't a proper file then don't try to hash it.
    if fi.err or not ST.S_ISREG(fi.st.st_mode):
      return None

    ## See whether there's a valid entry in the cache.
    if me._db:
      c = me._db.cursor()
      c.execute(
        'SELECT mtime, size, hash, seen FROM hash WHERE ino = ?;',
        [fi.st.st_ino])
      r = c.fetchone()
      if r is not None:
        mt, sz, h, s = r
        if mt == fi.st.st_mtime and \
           sz == fi.st.st_size:
          if not s:
            c.execute('UPDATE hash SET seen = 1 WHERE ino = ?',
                      [fi.st.st_ino])
          me._update()
          return h

    ## Hash the file.  Beware raciness: update the file information from the
    ## open descriptor, so that what we record describes the file we read.
    h = H.new(me.hash)
    try:
      with open(fi.name, 'rb') as f:
        while True:
          buf = f.read(me.BUFSZ)
          if len(buf) == 0:
            break
          h.update(buf)
        fi.st = OS.fstat(f.fileno())
    except (OSError, IOError) as err:
      fi.st = None
      fi.err = err
      return None
    hash = h.hexdigest()

    ## Insert a record into the database.
    if me._db:
      c.execute("""
              INSERT OR REPLACE INTO hash
                      (ino, mtime, ctime, size, hash, seen)
              VALUES
                      (?, ?, ?, ?, ?, 1);
      """, [fi.st.st_ino,
            fi.st.st_mtime,
            fi.st.st_ctime,
            fi.st.st_size,
            hash])
      me._update()

    ## Done.
    return hash

  def _update(me):
    """Note a pending change, committing once 1024 have accumulated."""
    me._pend += 1
    if me._pend >= 1024:
      me.flush()

  def flush(me):
    """Commit any pending changes to the database, if there is one."""
    if me._db:
      me._db.commit()
    me._pend = 0

  def need_db(me):
    """Die unless we have a cache database."""
    if not me._db:
      die("no cache database")

  def reset(me):
    """Mark every cache entry as not yet seen."""
    me.need_db()
    c = me._db.cursor()
    c.execute('UPDATE hash SET seen = 0 WHERE seen')
    me.flush()

  def prune(me):
    """Delete the cache entries which haven't been seen since `reset'."""
    me.need_db()
    c = me._db.cursor()
    c.execute('DELETE FROM hash WHERE NOT seen')
    me.flush()

###--------------------------------------------------------------------------
### Printing output.

class GenericFormatter (object):
  """
  Base class for formatting the output fields describing one file.

  Each method returns the text (or, for `size', the number) to print in the
  corresponding output column for the `FileInfo' given at construction.
  `info' relies on a TYPE attribute, which this class does not define.
  """

  def __init__(me, fi):
    me.fi = fi

  def _fmt_time(me, t):
    ## Format the time T (seconds since the epoch) as ISO 8601 UTC.
    return T.strftime('%Y-%m-%dT%H:%M:%SZ', T.gmtime(t))

  def _enc_name(me, n):
    ## Escape awkward characters in the name N, and make sure it can't
    ## contain an unescaped ` -> ', which we use to mark symlink targets.
    escaped = n.encode('string_escape')
    return escaped.replace(' -> ', ' \\-> ')

  def name(me):
    return me._enc_name(me.fi.name)

  def info(me):
    return me.TYPE

  def mode(me):
    st = me.fi.st
    return '%06o' % st.st_mode

  def size(me):
    st = me.fi.st
    return st.st_size

  def mtime(me):
    st = me.fi.st
    return me._fmt_time(st.st_mtime)

  def owner(me):
    st = me.fi.st
    return '%5d:%d' % (st.st_uid, st.st_gid)

class ErrorFormatter (GenericFormatter):
  """
  Formatter for files we couldn't examine.

  The info column shows the error number and message from the file's `err';
  the mode, size, mtime and owner columns all just say `error'.
  """

  def info(me):
    err = me.fi.err
    return 'E%d %s' % (err.errno, err.strerror)

  def error(me):
    return 'error'

  ## There's no stat information to report, so all of these are the same.
  mode = error
  size = error
  mtime = error
  owner = error

class SocketFormatter (GenericFormatter):
  ## Sockets: everything is inherited; TYPE is what `info' reports.
  TYPE = 'socket'
class PipeFormatter (GenericFormatter):
  ## Named pipes: everything is inherited; TYPE is what `info' reports.
  TYPE = 'fifo'

class LinkFormatter (GenericFormatter):
  """
  Formatter for symbolic links.

  The name column is extended to show the link target as `NAME -> TARGET'.
  If the link can't be read, the target is replaced by the error number and
  message in angle brackets.
  """

  TYPE = 'symbolic-link'

  def name(me):
    n = GenericFormatter.name(me)
    try:
      d = OS.readlink(me.fi.name)
      return '%s -> %s' % (n, me._enc_name(d))
    except OSError as err:
      return '%s -> <E%d %s>' % (n, err.errno, err.strerror)

class DirectoryFormatter (GenericFormatter):
  """
  Formatter for directories.

  The name is printed with a trailing `/', and the size column just says
  `dir' rather than giving a number.
  """

  TYPE = 'directory'

  def name(me):
    base = GenericFormatter.name(me)
    return base + '/'

  def size(me):
    return 'dir'

class DeviceFormatter (GenericFormatter):
  """
  Base class for formatting device nodes.

  The info column shows the subclass's TYPE followed by the device's major
  and minor numbers as `MAJOR:MINOR'.
  """

  def info(me):
    kind = me.TYPE
    rdev = me.fi.st.st_rdev
    return '%s %d:%d' % (kind, OS.major(rdev), OS.minor(rdev))
class BlockDeviceFormatter (DeviceFormatter):
  ## Block devices: TYPE is the label `DeviceFormatter.info' prints.
  TYPE = 'block-device'
class CharDeviceFormatter (DeviceFormatter):
  ## Character devices: TYPE is the label `DeviceFormatter.info' prints.
  TYPE = 'character-device'

class FileFormatter (GenericFormatter):
  ## Regular files: everything is inherited; TYPE is what `info' reports.
  TYPE = 'regular-file'

class Reporter (object):
  """
  Print one digest line per file to standard output.

  Real inode numbers aren't stable between filesystems, so each distinct
  (device, inode) pair is given a `virtual inode number' derived from the
  first name it was seen under.  Names which are hard links to the same
  file then share a virtual inode number.
  """

  ## Map from file type bits to the formatter class for that type.
  TYMAP = {
    ST.S_IFSOCK: SocketFormatter,
    ST.S_IFDIR: DirectoryFormatter,
    ST.S_IFLNK: LinkFormatter,
    ST.S_IFREG: FileFormatter,
    ST.S_IFBLK: BlockDeviceFormatter,
    ST.S_IFCHR: CharDeviceFormatter,
    ST.S_IFIFO: PipeFormatter,
  }

  def __init__(me, db):
    """
    Initialize a reporter which hashes files using DB.

    DB must provide a `hashfile' method and a `hash' attribute naming the
    hash function; the latter determines the width of the first column.
    """
    me._inomap = {}                     # (dev, ino) -> virtual inode
    me._vinomap = {}                    # virtual inode -> (dev, ino)
    me._db = db
    me._hsz = int(H.new(db.hash).digest_size)

  def file(me, fi):
    """Print the digest line for the file described by the `FileInfo' FI."""
    h = me._db.hashfile(fi)
    if fi.err:
      fmt = ErrorFormatter(fi)
      vino = 'error'
    else:
      fmt = me.TYMAP[ST.S_IFMT(fi.st.st_mode)](fi)
      inoidx = fi.st.st_dev, fi.st.st_ino
      try:
        vino = me._inomap[inoidx]
      except KeyError:

        ## A new file.  Derive a virtual inode number from its name, adding
        ## a sequence-number suffix for as long as the result collides with
        ## one we've already handed out to a different file.
        suffix = ''
        seq = 0
        while True:
          vino = '%08x' % (Z.crc32(fi.name + suffix) & 0xffffffff)
          if vino not in me._vinomap: break
          suffix = '\0%d' % seq
          seq += 1
        me._inomap[inoidx] = vino

        ## Remember that this number is taken, or the collision check above
        ## would never find anything.
        me._vinomap[vino] = inoidx

    ## Files without a hash get a bracketed description padded to the same
    ## width as a hex digest.
    if h: info = h
    else: info = '[%-*s]' % (2*me._hsz - 2, fmt.info())
    stdout.write('%s %8s %6s %-12s %-20s %20s %s\n' % (
      info, vino, fmt.mode(), fmt.owner(),
      fmt.mtime(), fmt.size(), fmt.name()))

###--------------------------------------------------------------------------
### Main program.

## Map each `--files' format name to a function which reads filenames in that
## format from standard input and passes each one to its argument.
FMTMAP = {
  'rsync': lambda f: enum_rsync(stdin, f),
  'find0': lambda f: enum_find0(stdin, f)
}

## Set up the command-line parser.
op = OP.OptionParser(
  usage = '%prog [-a] [-c CACHE] [-f FORMAT] [-H HASH] [FILE ...]',
  version = '%%prog, version %s' % VERSION,
  description = '''\
Print a digest of a filesystem (or a collection of specified files) to
standard output.  The idea is that the digest should be mostly /complete/
(i.e., any `interesting\' change to the filesystem results in a different
digest) and /canonical/ (i.e., identical filesystem contents result in
identical output).
''')
op.add_option('-a', '--all', action = 'store_true', dest = 'all',
              help = 'clear cache of all files not seen')
op.add_option('-c', '--cache', dest = 'cache', metavar = 'FILE',
              help = 'use FILE as a cache for file hashes')
op.add_option('-f', '--files', dest = 'files', metavar = 'FORMAT',
              type = 'choice', choices = FMTMAP.keys(),
              help = 'read files to report in the given FORMAT')
## The hash name isn't restricted to a fixed list of choices here.
op.add_option('-H', '--hash', dest = 'hash', metavar = 'HASH',
              help = 'use HASH as the hash function')

## We hand over the whole of `argv', so the first positional argument is the
## program name rather than a file to report on.
opts, args = op.parse_args(argv)
roots = args[1:]
if not (opts.files or roots):
  die("no filename sources: nothing to do")

## Open the cache.  With `--all', mark every entry unseen on the way in and
## throw away whatever is still unseen on the way out.
db = HashCache(opts.cache, opts.hash)
if opts.all:
  db.reset()
rep = Reporter(db)

## Report on the names from standard input first, then the named trees.
if opts.files:
  FMTMAP[opts.files](rep.file)
for root in roots:
  enum_walk(root, rep.file)

if opts.all:
  db.prune()
db.flush()

###----- That's all, folks --------------------------------------------------
Commit	Line	Data
f6b4ffdc MW	1	#! @PYTHON@
	2	###
	3	### Efficiently construct canonical digests of filesystems
	4	###
	5	### (c) 2012 Mark Wooding
	6	###
	7
	8	###----- Licensing notice ---------------------------------------------------
	9	###
	10	### This file is part of the `rsync-backup' program.
	11	###
	12	### rsync-backup is free software; you can redistribute it and/or modify
	13	### it under the terms of the GNU General Public License as published by
	14	### the Free Software Foundation; either version 2 of the License, or
	15	### (at your option) any later version.
	16	###
	17	### rsync-backup is distributed in the hope that it will be useful,
	18	### but WITHOUT ANY WARRANTY; without even the implied warranty of
	19	### MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	20	### GNU General Public License for more details.
	21	###
	22	### You should have received a copy of the GNU General Public License
	23	### along with rsync-backup; if not, write to the Free Software Foundation,
	24	### Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
	25
	26	from sys import argv, exit, stdin, stdout, stderr
	27	import os as OS
	28	import re as RX
	29	import time as T
	30	import stat as ST
	31	import optparse as OP
	32	import hashlib as H
	33	import sqlite3 as DB
	34	import zlib as Z
	35
	36	PACKAGE = '@PACKAGE@'
	37	VERSION = '@VERSION@'
	38
	39	###--------------------------------------------------------------------------
	40	### Utilities.
	41
	42	QUIS = OS.path.basename(argv[0])
	43
	44	def moan(msg):
	45	stderr.write('%s: %s\n' % (QUIS, msg))
	46
	47	def die(msg, rc = 1):
	48	moan(msg)
	49	exit(rc)
	50
	51	SYSERR = 0
	52	def syserr(msg):
	53	global SYSERR
	54	moan(msg)
	55	SYSERR += 1
	56
	57	###--------------------------------------------------------------------------
	58	### File system enumeration.
	59
	60	class FileInfo (object):
	61	def __init__(me, file, st = None):
	62	me.name = file
	63	if st:
	64	me.st = st
65	me.err = None
66	else:
67	try:
68	me.st = OS.lstat(file)
69	me.err = None
70	except OSError, err:
71	me.st = None
72	me.err = err
73
74	def enum_walk(file, func):
75
76	def dirents(name):
77	try:
78	return OS.listdir(name)
79	except OSError, err:
80	syserr("failed to read directory `%s': %s" % (name, err.strerror))
81	return []
82
83	def dir(ee, dev):
84	ff = []
85	dd = []
86	for e in ee:
87	fi = FileInfo(e)
88	if fi.st and fi.st.st_dev != dev: pass
89	if fi.st and ST.S_ISDIR(fi.st.st_mode): dd.append(fi)
90	else: ff.append(fi)
91	ff.sort(key = lambda fi: fi.name)
92	dd.sort(key = lambda fi: fi.name + '/')
93	for f in ff:
94	func(f)
95	for d in dd:
96	if d.st.st_dev == dev:
97	func(d)
98	dir([OS.path.join(d.name, e) for e in dirents(d.name)], dev)
99
100	if file.endswith('/'):
101	OS.chdir(file)
102	fi = FileInfo('.')
103	func(fi)
104	dir(dirents('.'), fi.st.st_dev)
105	else:
106	fi = FileInfo(file)
107	func(fi)
108	if fi.st and ST.S_ISDIR(fi.st.st_mode):
109	dir([OS.path.join(fi.name, e) for e in dirents(fi.name)],
110	fi.st.st_dev)
111
112	def enum_find0(f, func):
113	tail = ""
114	while True:
115	buf = f.read(8192)
116	last = len(buf) == 0
117	names = (tail + buf).split('\0')
118	tail = names.pop()
119	for n in names:
120	func(FileInfo(n))
121	if last:
122	break
123	if len(tail):
124	moan("ignored trailing junk after last filename")
125
126	RX_RSYNCESC = RX.compile(r'\\ \# ([0-7]{3})', RX.VERBOSE)
127	def enum_rsync(f, func):
128
129	## The format is a little fiddly. Each line consists of PERMS SIZE DATE
130	## TIME NAME, separated by runs of whitespace, but the NAME starts exactly
131	## one space character after the TIME and may begin with a space.
132	## Sequences of the form `\#OOO' where OOO are three octal digits, stand
133	## for a byte with that value. Newlines and backslashes which would be
134	## ambiguous are converted into this form; all other characters are
135	## literal.
136	##
137	## We ignore the stat information and retrieve it ourselves, because it's
138	## incomplete. Hopefully the dcache is still warm.
139
140	for line in f:
141	if line.endswith('\n'): line = line[:-1]
142
143	## Extract the escaped name.
144	ff = line.split(None, 3)
145	if len(ff) != 4:
146	syserr("ignoring invalid line from rsync: `%s'" % line)
147	continue
148	tail = ff[3]
149	try:
150	spc = tail.index(' ')
151	except ValueError:
152	syserr("ignoring invalid line from rsync: `%s'" % line)
153	continue
154	name = tail[spc + 1:]
155
156	## Now translate escape sequences.
157	name = RX_RSYNCESC.sub(lambda m: chr(int(m.group(1), 8)), name)
158
159	## Call the client.
160	try:
161	fi = FileInfo(name)
162	except OSError, err:
163	syserr("failed to stat `%s': %s" % (name, err.strerror))
164	continue
165	func(fi)
166
167	###--------------------------------------------------------------------------
168	### The hash cache.
169
170	class HashCache (object):
171
172	VERSION = 0
173	BUFSZ = 128*1024
174
175	INIT = [
176	"""CREATE TABLE meta (
177	version INTEGER NOT NULL,
178	hash TEXT NOT NULL
179	);""",
180	"""CREATE TABLE hash (
181	ino INTEGER PRIMARY KEY,
182	mtime INTEGER NOT NULL,
183	ctime INTEGER NOT NULL,
184	size INTEGER NOT NULL,
185	hash TEXT NOT NULL,
186	seen BOOLEAN NOT NULL DEFAULT TRUE
187	);""",
188	"""PRAGMA journal_mode = WAL;"""
189	]
190
191	def __init__(me, file, hash = None):
192
193	if file is None:
194
195	## We're going this alone, with no cache.
196	db = None
197	if hash is None:
198	die("no hash specified and no database cache to read from")
199	else:
200
201	## Connect to the database.
202	db = DB.connect(file)
203	db.text_factory = str
204
205	## See whether we can understand the cache database.
206	c = db.cursor()
207	v = h = None
208	try:
209	c.execute('SELECT version, hash FROM meta')
210	v, h = c.fetchone()
211	if c.fetchone() is not None:
212	die("cache database corrupt: meta table has mutliple rows")
213	except (DB.Error, TypeError):
214	pass
215
216	## If that didn't work, we'd better clear the thing and start again.
217	## But only if we know how to initialize it.
218	if v != me.VERSION:
219
220	## Explain the situation.
221	moan("cache version %s not understood" % v)
222	if hash is None:
223	if h is None:
224	die("can't initialize cache: no hash function set")
225	else:
226	hash = h
227	try:
228	H.new(hash)
229	except Exception:
230	die("unknown hash function `%s'" % hash)
231
232	## Drop old things.
233	c.execute('SELECT type, name FROM sqlite_master')
234	for type, name in c.fetchall():
235	c.execute('DROP %s IF EXISTS %s' % (type, name))
236
237	## Now we're ready to go.
238	for stmt in me.INIT:
239	c.execute(stmt)
240	c.execute('INSERT INTO meta VALUES (?, ?)', [me.VERSION, hash])
241	db.commit()
242
243	## Check the hash function if necessary.
244	if hash is None:
245	hash = h
246	elif h is not None and h != hash:
247	die("hash mismatch: cache uses %s but %s requested" % (h, hash))
248
249	## All done.
250	me.hash = hash
251	me._db = db
252	me._pend = 0
253
254	def hashfile(me, fi):
255
256	## If this isn't a proper file then don't try to hash it.
257	if fi.err or not ST.S_ISREG(fi.st.st_mode):
258	return None
259
260	## See whether there's a valid entry in the cache.
261	if me._db:
262	c = me._db.cursor()
263	c.execute(
264	'SELECT mtime, size, hash, seen FROM hash WHERE ino = ?;',
265	[fi.st.st_ino])
266	r = c.fetchone()
267	if r is not None:
268	mt, sz, h, s = r
269	if mt == fi.st.st_mtime and \
270	sz == fi.st.st_size:
271	if not s:
272	c.execute('UPDATE hash SET seen = 1 WHERE ino = ?',
273	[fi.st.st_ino])
274	me._update()
275	return h
276
277	## Hash the file. Beware raciness: update the file information from the
278	## open descriptor, but set the size from what we actually read.
279	h = H.new(me.hash)
280	try:
281	with open(fi.name, 'rb') as f:
282	sz = 0
283	while True:
284	buf = f.read(me.BUFSZ)
285	if len(buf) == 0:
286	break
287	sz += len(buf)
288	h.update(buf)
289	fi.st = OS.fstat(f.fileno())
290	##fi.st.st_size = sz
291	hash = h.digest()
292	except (OSError, IOError), err:
293	fi.st = None
294	fi.err = err
295	return None
296	hash = hash.encode('hex')
297
298	## Insert a record into the database.
299	if me._db:
300	c.execute("""
301	INSERT OR REPLACE INTO hash
302	(ino, mtime, ctime, size, hash, seen)
303	VALUES
304	(?, ?, ?, ?, ?, 1);
305	""", [fi.st.st_ino,
306	fi.st.st_mtime,
307	fi.st.st_ctime,
308	fi.st.st_size,
309	hash])
310	me._update()
311
312	## Done.
313	return hash
314
315	def _update(me):
316	me._pend += 1
317	if me._pend >= 1024:
318	me.flush()
319
320	def flush(me):
321	if me._db:
322	me._db.commit()
323	me._pend = 0
324
325	def need_db(me):
326	if not me._db:
327	die("no cache database")
328
329	def reset(me):
330	me.need_db()
331	c = me._db.cursor()
332	c.execute('UPDATE hash SET seen = 0 WHERE seen')
333	me.flush()
334
335	def prune(me):
336	me.need_db()
337	c = me._db.cursor()
338	c.execute('DELETE FROM hash WHERE NOT seen')
339	me.flush()
340
341	###--------------------------------------------------------------------------
342	### Printing output.
343
344	class GenericFormatter (object):
345	def __init__(me, fi):
346	me.fi = fi
347	def _fmt_time(me, t):
348	tm = T.gmtime(t)
349	return T.strftime('%Y-%m-%dT%H:%M:%SZ', tm)
350	def _enc_name(me, n):
8aeb0c53	351	return ' \\-> '.join(n.encode('string_escape').split(' -> '))
f6b4ffdc MW	352	def name(me):
	353	return me._enc_name(me.fi.name)
	354	def info(me):
	355	return me.TYPE
	356	def mode(me):
	357	return '%06o' % me.fi.st.st_mode
	358	def size(me):
	359	return me.fi.st.st_size
	360	def mtime(me):
	361	return me._fmt_time(me.fi.st.st_mtime)
	362	def owner(me):
	363	return '%5d:%d' % (me.fi.st.st_uid, me.fi.st.st_gid)
	364
	365	class ErrorFormatter (GenericFormatter):
	366	def info(me):
	367	return 'E%d %s' % (me.fi.err.errno, me.fi.err.strerror)
	368	def error(me): return 'error'
	369	mode = size = mtime = owner = error
	370
	371	class SocketFormatter (GenericFormatter):
	372	TYPE = 'socket'
	373	class PipeFormatter (GenericFormatter):
	374	TYPE = 'fifo'
	375
	376	class LinkFormatter (GenericFormatter):
	377	TYPE = 'symbolic-link'
	378	def name(me):
	379	n = GenericFormatter.name(me)
	380	try:
	381	d = OS.readlink(me.fi.name)
	382	return '%s -> %s' % (n, me._enc_name(d))
	383	except OSError, err:
	384	return '%s -> <E%d %s>' % (n, err.errno, err.strerror)
	385
	386	class DirectoryFormatter (GenericFormatter):
	387	TYPE = 'directory'
	388	def name(me): return GenericFormatter.name(me) + '/'
	389	def size(me): return 'dir'
	390
	391	class DeviceFormatter (GenericFormatter):
	392	def info(me):
	393	return '%s %d:%d' % (me.TYPE,
	394	OS.major(me.fi.st.st_rdev),
	395	OS.minor(me.fi.st.st_rdev))
	396	class BlockDeviceFormatter (DeviceFormatter):
	397	TYPE = 'block-device'
	398	class CharDeviceFormatter (DeviceFormatter):
	399	TYPE = 'character-device'
	400
	401	class FileFormatter (GenericFormatter):
	402	TYPE = 'regular-file'
	403
	404	class Reporter (object):
	405
	406	TYMAP = {
	407	ST.S_IFSOCK: SocketFormatter,
	408	ST.S_IFDIR: DirectoryFormatter,
	409	ST.S_IFLNK: LinkFormatter,
	410	ST.S_IFREG: FileFormatter,
	411	ST.S_IFBLK: BlockDeviceFormatter,
	412	ST.S_IFCHR: CharDeviceFormatter,
	413	ST.S_IFIFO: PipeFormatter,
	414	}
	415
416	def __init__(me, db):
417	me._inomap = {}
418	me._vinomap = {}
419	me._db = db
420	me._hsz = int(H.new(db.hash).digest_size)
421
422	def file(me, fi):
423	h = me._db.hashfile(fi)
424	if fi.err:
425	fmt = ErrorFormatter(fi)
426	vino = 'error'
427	else:
428	fmt = me.TYMAP[ST.S_IFMT(fi.st.st_mode)](fi)
429	inoidx = fi.st.st_dev, fi.st.st_ino
430	try:
431	vino = me._inomap[inoidx]
432	except KeyError:
433	suffix = ''
434	seq = 0
435	while True:
436	vino = '%08x' % (Z.crc32(fi.name + suffix) & 0xffffffff)
437	if vino not in me._vinomap: break
438	suffix = '\0%d' % seq
439	seq += 1
440	me._inomap[inoidx] = vino
441	if h: info = h
442	else: info = '[%-s]' % (2me._hsz - 2, fmt.info())
443	print '%s %8s %6s %-12s %-20s %20s %s' % (
444	info, vino, fmt.mode(), fmt.owner(),
445	fmt.mtime(), fmt.size(), fmt.name())
446
447	###--------------------------------------------------------------------------
448	### Main program.
449
450	FMTMAP = {
451	'rsync': lambda f: enum_rsync(stdin, f),
452	'find0': lambda f: enum_find0(stdin, f)
453	}
454	op = OP.OptionParser(
455	usage = '%prog [-a] [-c CACHE] [-f FORMAT] [-H HASH] [FILE ...]',
456	version = '%%prog, version %s' % VERSION,
457	description = '''\
458	Print a digest of a filesystem (or a collection of specified files) to
459	standard output. The idea is that the digest should be mostly /complete/
460	(i.e., any `interesting\' change to the filesystem results in a different
461	digest) and /canonical/ (i.e., identical filesystem contents result in
462	identical output).
463	''')
464
465	for short, long, props in [
466	('-a', '--all', { 'action': 'store_true', 'dest': 'all',
467	'help': 'clear cache of all files not seen' }),
468	('-c', '--cache', { 'dest': 'cache', 'metavar': 'FILE',
469	'help': 'use FILE as a cache for file hashes' }),
470	('-f', '--files', { 'dest': 'files', 'metavar': 'FORMAT',
471	'type': 'choice', 'choices': FMTMAP.keys(),
472	'help': 'read files to report in the given FORMAT' }),
473	('-H', '--hash', { 'dest': 'hash', 'metavar': 'HASH',
474	##'type': 'choice', 'choices': H.algorithms,
475	'help': 'use HASH as the hash function' })]:
476	op.add_option(short, long, **props)
477	opts, args = op.parse_args(argv)
478
479	if not opts.files and len(args) <= 1:
480	die("no filename sources: nothing to do")
481	db = HashCache(opts.cache, opts.hash)
482	if opts.all:
483	db.reset()
484	rep = Reporter(db)
485	if opts.files:
486	FMTMAP[opts.files](rep.file)
487	for dir in args[1:]:
488	enum_walk(dir, rep.file)
489	if opts.all:
490	db.prune()
491	db.flush()
492
493	###----- That's all, folks --------------------------------------------------