Release 1.1.1.
[rsync-backup] / fshash.in
CommitLineData
f6b4ffdc
MW
1#! @PYTHON@
2###
3### Efficiently construct canonical digests of filesystems
4###
5### (c) 2012 Mark Wooding
6###
7
8###----- Licensing notice ---------------------------------------------------
9###
10### This file is part of the `rsync-backup' program.
11###
12### rsync-backup is free software; you can redistribute it and/or modify
13### it under the terms of the GNU General Public License as published by
14### the Free Software Foundation; either version 2 of the License, or
15### (at your option) any later version.
16###
17### rsync-backup is distributed in the hope that it will be useful,
18### but WITHOUT ANY WARRANTY; without even the implied warranty of
19### MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20### GNU General Public License for more details.
21###
22### You should have received a copy of the GNU General Public License
23### along with rsync-backup; if not, write to the Free Software Foundation,
24### Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
25
26from sys import argv, exit, stdin, stdout, stderr
27import os as OS
28import re as RX
29import time as T
80d1feec 30import errno as E
f6b4ffdc
MW
31import stat as ST
32import optparse as OP
33import hashlib as H
34import sqlite3 as DB
35import zlib as Z
36
37PACKAGE = '@PACKAGE@'
38VERSION = '@VERSION@'
39
40###--------------------------------------------------------------------------
41### Utilities.
42
43QUIS = OS.path.basename(argv[0])
44
def moan(msg):
  """Grumble MSG to standard error, prefixed by the program's name."""
  stderr.write('%s: %s\n' % (QUIS, msg))
47
def die(msg, rc = 1):
  """Grumble MSG to standard error and give up, exiting with status RC."""
  moan(msg)
  exit(rc)
51
## Running count of system errors reported; consulted for the exit status.
SYSERR = 0
def syserr(msg):
  """Report a nonfatal system error MSG and remember that one happened."""
  global SYSERR
  moan(msg)
  SYSERR = SYSERR + 1
58###--------------------------------------------------------------------------
59### File system enumeration.
60
class FileInfo (object):
  """
  A file name bundled with its `lstat' result.

  Attributes:
    name   the file name, as supplied
    st     the `os.lstat' result for the file, or None if the stat failed
    err    the OSError from the failed stat, or None on success

  If ST is supplied, it is trusted and no stat is performed.
  """
  def __init__(me, file, st = None):
    me.name = file
    if st:
      me.st = st
      me.err = None
    else:
      try:
        me.st = OS.lstat(file)
        me.err = None
      ## `except OSError as err' (rather than `except OSError, err') is
      ## valid from Python 2.6 and is the only form Python 3 accepts.
      except OSError as err:
        me.st = None
        me.err = err
74
def enum_walk(file, func):
  ## Walk the filesystem under FILE, calling FUNC on a FileInfo for every
  ## object found.  Within each directory, non-directories are reported
  ## first (sorted by name), then subdirectories (sorted), recursing only
  ## into subdirectories on the same device.  If FILE ends in `/', we chdir
  ## into it and walk relative names, restoring the original directory
  ## afterwards; otherwise names are reported relative to FILE itself.

  def dirents(name):
    ## Return the entries of directory NAME, or [] (after complaining)
    ## if it can't be read.
    try:
      return OS.listdir(name)
    except OSError, err:
      syserr("failed to read directory `%s': %s" % (name, err.strerror))
      return []

  def dir(ee, dev):
    ## Report the directory entries EE, which live on device DEV, and
    ## recurse into same-device subdirectories.
    ff = []
    dd = []
    for e in ee:
      fi = FileInfo(e)
      ## NOTE(review): this statement does nothing (`pass'); it looks as
      ## though it was meant to skip entries on other devices, but as
      ## written such entries still fall through to the tests below.
      ## Cross-device directories are only suppressed later, by the
      ## st_dev check before recursion -- confirm intended behaviour.
      if fi.st and fi.st.st_dev != dev: pass
      if fi.st and ST.S_ISDIR(fi.st.st_mode): dd.append(fi)
      else: ff.append(fi)
    ## Sort each group by name; the appended `/' makes the directory
    ## ordering match the names as they will be printed.
    ff.sort(key = lambda fi: fi.name)
    dd.sort(key = lambda fi: fi.name + '/')
    for f in ff:
      func(f)
    for d in dd:
      if d.st.st_dev == dev:
        func(d)
        dir([OS.path.join(d.name, e) for e in dirents(d.name)], dev)

  if file.endswith('/'):
    ## Walk relative to FILE: remember the current directory on a file
    ## descriptor so we can reliably return to it afterwards.
    cwd = OS.open('.', OS.O_RDONLY)
    try:
      OS.chdir(file)
      fi = FileInfo('.')
      func(fi)
      dir(dirents('.'), fi.st.st_dev)
    finally:
      OS.fchdir(cwd)
      OS.close(cwd)
  else:
    ## Report FILE itself, and its contents if it turns out to be a
    ## directory.
    fi = FileInfo(file)
    func(fi)
    if fi.st and ST.S_ISDIR(fi.st.st_mode):
      dir([OS.path.join(fi.name, e) for e in dirents(fi.name)],
          fi.st.st_dev)
117
def enum_find0(f, func):
  """
  Report on the NUL-terminated file names read from the stream F.

  Each complete name is wrapped in a `FileInfo' and handed to FUNC, in the
  order read.  A trailing fragment with no terminating NUL is discarded
  with a warning.
  """
  carry = ""
  while True:
    chunk = f.read(8192)
    done = not chunk
    pieces = (carry + chunk).split('\0')
    carry = pieces.pop()
    for nm in pieces:
      func(FileInfo(nm))
    if done: break
  if carry:
    moan("ignored trailing junk after last filename")
131
## Matches rsync's `\#OOO' escape for a byte with octal value OOO.
RX_RSYNCESC = RX.compile(r'\\ \# ([0-7]{3})', RX.VERBOSE)
def enum_rsync(f, func):
  """
  Report on the files named in rsync logging output read from the stream
  F, calling FUNC on a `FileInfo' for each one.
  """

  ## The format is a little fiddly.  Each line consists of PERMS SIZE DATE
  ## TIME NAME, separated by runs of whitespace, but the NAME starts exactly
  ## one space character after the TIME and may begin with a space.
  ## Sequences of the form `\#OOO' where OOO are three octal digits, stand
  ## for a byte with that value.  Newlines and backslashes which would be
  ## ambiguous are converted into this form; all other characters are
  ## literal.
  ##
  ## We ignore the stat information and retrieve it ourselves, because it's
  ## incomplete.  Hopefully the dcache is still warm.

  for line in f:
    if line.endswith('\n'): line = line[:-1]

    ## Extract the escaped name.
    ff = line.split(None, 3)
    if len(ff) != 4:
      syserr("ignoring invalid line from rsync: `%s'" % line)
      continue
    tail = ff[3]
    try:
      spc = tail.index(' ')
    except ValueError:
      syserr("ignoring invalid line from rsync: `%s'" % line)
      continue
    name = tail[spc + 1:]

    ## Now translate escape sequences.
    name = RX_RSYNCESC.sub(lambda m: chr(int(m.group(1), 8)), name)

    ## Call the client.  (FileInfo traps stat failures itself, so this
    ## handler is belt and braces; the `as' form works on Python 2.6+.)
    try:
      fi = FileInfo(name)
    except OSError as err:
      syserr("failed to stat `%s': %s" % (name, err.strerror))
      continue
    func(fi)
172
173###--------------------------------------------------------------------------
174### The hash cache.
175
class HashCache (object):
  """
  A persistent cache of file-content hashes, keyed by inode number.

  The cache lives in a SQLite database, so that files whose mtime and size
  are unchanged since the last run need not be re-read.  If no database
  file is supplied, caching is disabled and every file is hashed afresh.
  """

  ## Bump VERSION when the schema changes; an old cache is then rebuilt.
  VERSION = 0

  ## Chunk size for reading file contents while hashing.
  BUFSZ = 128*1024

  ## Schema: `meta' records the cache version and hash function; `hash'
  ## holds one row per inode, with stat fields used for validation and a
  ## `seen' flag used by reset/prune to expire stale entries.
  INIT = [
    """CREATE TABLE meta (
           version INTEGER NOT NULL,
           hash TEXT NOT NULL
       );""",
    """CREATE TABLE hash (
           ino INTEGER PRIMARY KEY,
           mtime INTEGER NOT NULL,
           ctime INTEGER NOT NULL,
           size INTEGER NOT NULL,
           hash TEXT NOT NULL,
           seen BOOLEAN NOT NULL DEFAULT TRUE
       );""",
    """PRAGMA journal_mode = WAL;"""
  ]

  def __init__(me, file, hash = None):
    """
    Open (or create) the cache database FILE, using hash function HASH.

    Either argument may be None: with no FILE there is no cache and HASH
    is required; with no HASH, the function recorded in the cache is used.
    A cache of the wrong version is cleared and reinitialized.
    """

    if file is None:

      ## We're going this alone, with no cache.
      db = None
      if hash is None:
        die("no hash specified and no database cache to read from")
    else:

      ## Connect to the database.
      db = DB.connect(file)
      db.text_factory = str

      ## See whether we can understand the cache database.
      c = db.cursor()
      v = h = None
      try:
        c.execute('SELECT version, hash FROM meta')
        v, h = c.fetchone()
        if c.fetchone() is not None:
          die("cache database corrupt: meta table has multiple rows")
      except (DB.Error, TypeError):
        pass

      ## If that didn't work, we'd better clear the thing and start again.
      ## But only if we know how to initialize it.
      if v != me.VERSION:

        ## Explain the situation.
        moan("cache version %s not understood" % v)
        if hash is None:
          if h is None:
            die("can't initialize cache: no hash function set")
          else:
            hash = h
        try:
          H.new(hash)
        except Exception:
          die("unknown hash function `%s'" % hash)

        ## Drop old things.
        c.execute('SELECT type, name FROM sqlite_master')
        for type, name in c.fetchall():
          c.execute('DROP %s IF EXISTS %s' % (type, name))

        ## Now we're ready to go.
        for stmt in me.INIT:
          c.execute(stmt)
        c.execute('INSERT INTO meta VALUES (?, ?)', [me.VERSION, hash])
        db.commit()

      ## Check the hash function if necessary.
      if hash is None:
        hash = h
      elif h is not None and h != hash:
        die("hash mismatch: cache uses %s but %s requested" % (h, hash))

    ## All done.
    me.hash = hash
    me._db = db
    me._pend = 0

  def hashfile(me, fi):
    """
    Return the hex digest of the regular file described by FI.

    Uses the cache when the inode, mtime and size all match; otherwise
    reads and hashes the file and records the result.  Returns None (and
    sets FI.err) on read failure, or for things that aren't regular files.
    """

    ## If this isn't a proper file then don't try to hash it.
    if fi.err or not ST.S_ISREG(fi.st.st_mode):
      return None

    ## See whether there's a valid entry in the cache.
    if me._db:
      c = me._db.cursor()
      c.execute(
        'SELECT mtime, size, hash, seen FROM hash WHERE ino = ?;',
        [fi.st.st_ino])
      r = c.fetchone()
      if r is not None:
        mt, sz, h, s = r
        if mt == fi.st.st_mtime and \
           sz == fi.st.st_size:
          ## Cache hit: make sure the entry is marked as seen.
          if not s:
            c.execute('UPDATE hash SET seen = 1 WHERE ino = ?',
                      [fi.st.st_ino])
            me._update()
          return h

    ## Hash the file.  Beware raciness: update the file information from the
    ## open descriptor, but set the size from what we actually read.
    h = H.new(me.hash)
    try:
      with open(fi.name, 'rb') as f:
        sz = 0
        while True:
          buf = f.read(me.BUFSZ)
          if len(buf) == 0:
            break
          sz += len(buf)
          h.update(buf)
        fi.st = OS.fstat(f.fileno())
        ##fi.st.st_size = sz
        ## hexdigest() gives the same string as digest().encode('hex') did,
        ## but without relying on the Python-2-only `hex' codec.
        hash = h.hexdigest()
    except (OSError, IOError) as err:
      fi.st = None
      fi.err = err
      return None

    ## Insert a record into the database.
    if me._db:
      c.execute("""
              INSERT OR REPLACE INTO hash
                      (ino, mtime, ctime, size, hash, seen)
              VALUES
                      (?, ?, ?, ?, ?, 1);
      """, [fi.st.st_ino,
            fi.st.st_mtime,
            fi.st.st_ctime,
            fi.st.st_size,
            hash])
      me._update()

    ## Done.
    return hash

  def _update(me):
    ## Note a pending change; commit in batches so a crash loses little
    ## work without paying for a transaction per file.
    me._pend += 1
    if me._pend >= 1024:
      me.flush()

  def flush(me):
    """Commit any outstanding changes to the database."""
    if me._db:
      me._db.commit()
    me._pend = 0

  def need_db(me):
    """Fail unless a cache database is actually open."""
    if not me._db:
      die("no cache database")

  def forget(me, ino):
    """Discard the cache entry for inode INO, if any."""
    me.need_db()
    c = me._db.cursor()
    c.execute('DELETE FROM hash WHERE ino = ?', [ino])

  def reset(me):
    """Clear all the `seen' flags, ready for a fresh enumeration."""
    me.need_db()
    c = me._db.cursor()
    c.execute('UPDATE hash SET seen = 0 WHERE seen')
    me.flush()

  def prune(me):
    """Discard entries for files not seen since the last `reset'."""
    me.need_db()
    c = me._db.cursor()
    c.execute('DELETE FROM hash WHERE NOT seen')
    me.flush()
351
352###--------------------------------------------------------------------------
353### Printing output.
354
class GenericFormatter (object):
  """
  Base class for the report formatters: renders the common fields of a
  `FileInfo' as the strings which appear in the output listing.
  """
  def __init__(me, fi):
    me.fi = fi
  def _fmt_time(me, t):
    ## Render timestamp T as ISO8601 UTC.
    return T.strftime('%Y-%m-%dT%H:%M:%SZ', T.gmtime(t))
  def _enc_name(me, n):
    ## Escape awkward bytes in N, and protect any literal ` -> ' so it
    ## can't be mistaken for a symlink-target separator.
    escaped = n.encode('string_escape')
    return escaped.replace(' -> ', ' \\-> ')
  def name(me):
    return me._enc_name(me.fi.name)
  def info(me):
    return me.TYPE
  def mode(me):
    return '%06o' % me.fi.st.st_mode
  def size(me):
    return me.fi.st.st_size
  def mtime(me):
    return me._fmt_time(me.fi.st.st_mtime)
  def owner(me):
    return '%5d:%d' % (me.fi.st.st_uid, me.fi.st.st_gid)
375
class ErrorFormatter (GenericFormatter):
  """Formatter for files whose stat failed: report the error instead."""
  def info(me):
    err = me.fi.err
    return 'E%d %s' % (err.errno, err.strerror)
  def error(me): return 'error'
  ## Without a stat result, none of these fields means anything.
  mode = size = mtime = owner = error
381
class SocketFormatter (GenericFormatter):
  ## Unix-domain sockets need no detail beyond their type.
  TYPE = 'socket'
class PipeFormatter (GenericFormatter):
  ## Likewise named pipes (FIFOs).
  TYPE = 'fifo'
386
class LinkFormatter (GenericFormatter):
  """Formatter for symbolic links: append ` -> TARGET' to the name."""
  TYPE = 'symbolic-link'
  def name(me):
    n = GenericFormatter.name(me)
    try:
      d = OS.readlink(me.fi.name)
      return '%s -> %s' % (n, me._enc_name(d))
    ## `as' form of the except clause: valid on Python 2.6+ and Python 3.
    except OSError as err:
      ## Can't read the target: show the error in its place.
      return '%s -> <E%d %s>' % (n, err.errno, err.strerror)
396
class DirectoryFormatter (GenericFormatter):
  ## Directories get a trailing `/' on the name and no meaningful size.
  TYPE = 'directory'
  def name(me): return GenericFormatter.name(me) + '/'
  def size(me): return 'dir'
401
class DeviceFormatter (GenericFormatter):
  ## Device nodes: report the type together with major:minor numbers.
  def info(me):
    return '%s %d:%d' % (me.TYPE,
                         OS.major(me.fi.st.st_rdev),
                         OS.minor(me.fi.st.st_rdev))
class BlockDeviceFormatter (DeviceFormatter):
  ## Block devices.
  TYPE = 'block-device'
class CharDeviceFormatter (DeviceFormatter):
  ## Character devices.
  TYPE = 'character-device'
411
class FileFormatter (GenericFormatter):
  ## Regular files: the interesting part is the hash, printed separately.
  TYPE = 'regular-file'
414
class Reporter (object):
  """
  Ties the formatters together and emits one report line per file,
  fetching hashes from (and updating) the cache DB as it goes.
  """

  ## Map from `stat' file-type bits to the formatter class to use.
  TYMAP = {
    ST.S_IFSOCK: SocketFormatter,
    ST.S_IFDIR: DirectoryFormatter,
    ST.S_IFLNK: LinkFormatter,
    ST.S_IFREG: FileFormatter,
    ST.S_IFBLK: BlockDeviceFormatter,
    ST.S_IFCHR: CharDeviceFormatter,
    ST.S_IFIFO: PipeFormatter,
  }

  def __init__(me, db):
    """Prepare to report files, using the HashCache DB for hashes."""
    me._inomap = {}
    me._vinomap = {}
    me._db = db
    me._hsz = int(H.new(db.hash).digest_size)

  def file(me, fi):
    """Write the report line for the file described by FI to stdout."""
    h = me._db.hashfile(fi)
    if fi.err:
      fmt = ErrorFormatter(fi)
      vino = 'error'
    else:
      fmt = me.TYMAP[ST.S_IFMT(fi.st.st_mode)](fi)
      inoidx = fi.st.st_dev, fi.st.st_ino
      try:
        vino = me._inomap[inoidx]
      except KeyError:
        ## Invent a stable pseudo-inode from the name, dodging collisions
        ## with pseudo-inodes already handed out.
        suffix = ''
        seq = 0
        while True:
          vino = '%08x' % (Z.crc32(fi.name + suffix) & 0xffffffff)
          if vino not in me._vinomap: break
          suffix = '\0%d' % seq
          seq += 1
        ## BUGFIX: record the pseudo-inode in `_vinomap' too -- previously
        ## it was never populated, so the collision check above could
        ## never fire.
        me._vinomap[vino] = inoidx
        me._inomap[inoidx] = vino
    if h: info = h
    else: info = '[%-*s]' % (2*me._hsz - 2, fmt.info())
    ## Write the line directly rather than via the Python 2 `print'
    ## statement; the output is identical and this form is portable.
    stdout.write('%s %8s %6s %-12s %-20s %20s %s\n' % (
      info, vino, fmt.mode(), fmt.owner(),
      fmt.mtime(), fmt.size(), fmt.name()))
457
458###--------------------------------------------------------------------------
80d1feec
MW
459### Database clearing from diff files.
460
## Matches a unified-diff hunk header, capturing the old and new counts.
R_HUNK = RX.compile(r'^@@ -\d+,(\d+) \+\d+,(\d+) @@$')

def clear_entry(db, lno, line):
  """
  Parse LINE, taken from line number LNO of a digest diff, and forget the
  cache entry for the file it names.  Return False if the line couldn't be
  parsed or the file couldn't be statted (other than not existing).
  """

  good = True

  ## Parse off the type (bracketed) or hash (bare) field.
  if line.startswith('['):
    pos = line.find(']')
    if pos < 0:
      moan("failed to parse file entry (type field; line %d)" % lno)
      return False
    ty = line[1:pos].strip()
    rest = line[pos + 1:]
    hash = None
  else:
    ff = line.split(None, 1)
    if len(ff) != 2:
      moan("failed to parse file entry (field split; line %d)" % lno)
      return False
    ty = 'regular-file'
    hash, rest = ff

  ## Split out the remaining fixed fields; the name is everything left.
  ff = rest.split(None, 5)
  if len(ff) != 6:
    moan("failed to parse file entry (field split; line %d)" % lno)
    return False
  ino, mode, uidgid, mtime, sz, name = ff

  ## Symbolic links carry their target in the name field.
  if ty != 'symbolic-link':
    target = None
  else:
    nn = name.split(' -> ', 1)
    if len(nn) != 2:
      moan("failed to parse file entry (name split; line %d)" % lno)
      return False
    name, target = nn
    target = target.decode('string_escape')
  name = name.decode('string_escape')

  ## Find the file's current inode and forget its cache entry.  A missing
  ## file is fine (there's nothing to forget); other stat errors are not.
  try:
    st = OS.lstat(name)
  except OSError as e:
    moan("failed to stat `%s': %s" % (name, e.strerror))
    if e.errno != E.ENOENT: good = False
  else:
    stdout.write("Clear cache entry for `%s'\n" % name)
    db.forget(st.st_ino)

  return good
510
def clear_cache(db):
  """
  Read a unified diff between two digests from stdin and clear the cache
  entries (in DB) of every file mentioned on a changed line.  Return True
  on success, False if anything went wrong.
  """

  ## Work through the input diff file one line at a time.
  diffstate = 'gap'
  lno = 0
  good = True
  for line in stdin:
    if line.endswith('\n'): line = line[:-1]
    lno += 1

    ## We're in a gap between hunks.  Find a hunk header and extract the line
    ## counts.
    if diffstate == 'gap':
      m = R_HUNK.match(line)
      if m:
        oldlines = int(m.group(1))
        newlines = int(m.group(2))
        diffstate = 'hunk'
        hdrlno = lno

    ## We're in a hunk.  Keep track of whether we've reached the end, and
    ## discard entries from the cache for mismatching lines.
    elif diffstate == 'hunk':
      if len(line) == 0:
        moan("empty line in diff hunk (line %d)" % lno)
        good = False
        ## BUGFIX: skip to the next line; previously control fell through
        ## to `line[0]' below and raised IndexError.
        continue
      ty = line[0]
      if ty == ' ':
        oldlines -= 1; newlines -= 1
      elif ty == '+':
        newlines -= 1
        if not clear_entry(db, lno, line[1:]): good = False
      elif ty == '-':
        oldlines -= 1
        if not clear_entry(db, lno, line[1:]): good = False
      else:
        moan("incomprehensible line in diff hunk (line %d)" % lno)
        ## BUGFIX: was `good = false' (lower case), a NameError.
        good = False
      if oldlines < 0 or newlines < 0:
        moan("inconsistent lengths in diff hunk header (line %d)" % hdrlno)
        good = False
      if oldlines == newlines == 0:
        diffstate = 'gap'

  ## Complain if the input stopped in mid-hunk.
  if diffstate == 'hunk':
    moan("truncated diff hunk (started at line %d)" % hdrlno)
    good = False

  return good
560
561###--------------------------------------------------------------------------
f6b4ffdc
MW
562### Main program.
563
## Map each `-f' format name to a driver which feeds file reports to the
## given callback.
FMTMAP = {
  'rsync': lambda f: enum_rsync(stdin, f),
  'find0': lambda f: enum_find0(stdin, f)
}
op = OP.OptionParser(
  usage = '%prog [-au] [-c CACHE] [-f FORMAT] [-H HASH] [FILE ...]',
  version = '%%prog, version %s' % VERSION,
  description = '''\
Print a digest of a filesystem (or a collection of specified files) to
standard output.  The idea is that the digest should be mostly /complete/
(i.e., any `interesting\' change to the filesystem results in a different
digest) and /canonical/ (i.e., identical filesystem contents result in
identical output).
''')

## Define the command-line options.
for short, long, props in [
  ('-a', '--all', { 'action': 'store_true', 'dest': 'all',
                    'help': 'clear cache of all files not seen' }),
  ('-c', '--cache', { 'dest': 'cache', 'metavar': 'FILE',
                      'help': 'use FILE as a cache for file hashes' }),
  ('-f', '--files', { 'dest': 'files', 'metavar': 'FORMAT',
                      'type': 'choice', 'choices': FMTMAP.keys(),
                      'help': 'read files to report in the given FORMAT' }),
  ('-u', '--udiff', { 'action': 'store_true', 'dest': 'udiff',
                      'help': 'read diff from stdin, clear cache entries' }),
  ('-H', '--hash', { 'dest': 'hash', 'metavar': 'HASH',
                     ##'type': 'choice', 'choices': H.algorithms,
                     'help': 'use HASH as the hash function' })]:
  op.add_option(short, long, **props)
opts, args = op.parse_args(argv)

if opts.udiff:
  ## Diff mode: read a digest diff from stdin and clear the cache entries
  ## of the files it mentions.  Requires a cache; an optional single
  ## argument names the directory the digest was taken relative to.
  if opts.cache is None or opts.all or opts.files or len(args) > 2:
    die("incompatible options: `-u' requires `-c CACHE', forbids others")
  db = HashCache(opts.cache, opts.hash)
  if len(args) == 2: OS.chdir(args[1])
  good = True
  if not clear_cache(db): good = False
  if good: db.flush()
  else: exit(2)
else:
  ## Digest mode: enumerate the files (from stdin in the chosen format,
  ## and/or by walking the named directories) and print the report.
  if not opts.files and len(args) <= 1:
    die("no filename sources: nothing to do")
  db = HashCache(opts.cache, opts.hash)
  ## With `-a', expire cache entries for files we don't see this run.
  if opts.all:
    db.reset()
  rep = Reporter(db)
  if opts.files:
    FMTMAP[opts.files](rep.file)
  for dir in args[1:]:
    enum_walk(dir, rep.file)
  if opts.all:
    db.prune()
  db.flush()
f6b4ffdc
MW
618
619###----- That's all, folks --------------------------------------------------