mdw@git.distorted.org.uk Git - autoys/blob - gremlin/gremlin.in

   1 #! @PYTHON@
   2 ###
   3 ### Convert a directory tree of audio files
   4 ###
   5 ### (c) 2010 Mark Wooding
   6 ###
   7
   8 ###----- Licensing notice ---------------------------------------------------
   9 ###
  10 ### This file is part of the `autoys' audio tools collection.
  11 ###
  12 ### `autoys' is free software; you can redistribute it and/or modify
  13 ### it under the terms of the GNU General Public License as published by
  14 ### the Free Software Foundation; either version 2 of the License, or
  15 ### (at your option) any later version.
  16 ###
  17 ### `autoys' is distributed in the hope that it will be useful,
  18 ### but WITHOUT ANY WARRANTY; without even the implied warranty of
  19 ### MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  20 ### GNU General Public License for more details.
  21 ###
  22 ### You should have received a copy of the GNU General Public License
  23 ### along with `autoys'; if not, write to the Free Software Foundation,
  24 ### Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
  25
  26 ###--------------------------------------------------------------------------
  27 ### External dependencies.
  28
  29 ## Language features.
  30 from __future__ import with_statement
  31
  32 ## Standard Python libraries.
  33 import errno as E
  34 import fnmatch as FN
  35 import locale as LC
  36 import optparse as OP
  37 import os as OS
  38 import re as RX
  39 import sys as SYS
  40 import time as T
  41 import shlex as L
  42 import shutil as SH
  43 import threading as TH
  44 import unicodedata as UD
  45 from math import sqrt, ceil
  46 from contextlib import contextmanager
  47
  48 ## eyeD3 tag fettling.
  49 import eyed3 as E3
  50
  51 ## Gstreamer.
  52 import gi
  53 gi.require_version('GLib', '2.0'); from gi.repository import GLib as G
  54 gi.require_version('Gio', '2.0'); from gi.repository import Gio as GIO
  55 gi.require_version('Gst', '1.0'); from gi.repository import Gst as GS
  56 GS.init([])
  57
  58 ## Python Imaging.
  59 from PIL import Image as I
  60
  61 ## Python parsing.
  62 import pyparsing as P
  63
  64 ###--------------------------------------------------------------------------
  65 ### Special initialization.
  66
  67 VERSION = '@VERSION@'
  68
  69 ## GLib.
  70 G.threads_init()
  71
  72 ###--------------------------------------------------------------------------
  73 ### Eyecandy progress reports.
  74
  75 DEFAULT_ENCODING = None
  76
  77 def charwidth(s):
  78   """
  79   Return the width of S, in characters.
  80
  81   Specifically, this is the number of backspace characters required to
  82   overprint the string S.  If the current encoding for `stdout' appears to be
  83   Unicode then do a complicated Unicode thing; otherwise assume that
  84   characters take up one cell each.
  85
  86   None of this handles tab characters in any kind of useful way.  Sorry.
  87   """
  88
  89   global DEFAULT_ENCODING
  90
  91   ## Figure out the default encoding.
  92   if DEFAULT_ENCODING is None: DEFAULT_ENCODING = LC.getpreferredencoding()
  93
  94   ## Turn the string into Unicode so we can hack on it properly.  Maybe that
  95   ## won't work out, in which case fall back to being stupid.
  96   try: u = s.decode(DEFAULT_ENCODING)
  97   except UnicodeError: return len(s)
  98
  99   ## Our main problem is combining characters, but we should also try to
 100   ## handle wide (mostly Asian) characters, and zero-width ones.  This hack
 101   ## is taken mostly from http://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c
 102   w = 0
 103   for ch in u:
 104     cd = ord(ch)
 105     if UD.category(ch) in ['Cf', 'Me', 'Mn'] or \
 106           0x1160 <= cd <= 0x11ff: pass
 107     elif UD.east_asian_width(ch) in ['F', 'W']: w += 2
 108     else: w += 1
 109
 110   ## Done.
 111   return w
 112
 113 class StatusLine (object):
 114   """
 115   Maintains a status line containing ephemeral progress information.
 116
 117   The status line isn't especially important, but it keeps interactive users
 118   amused.
 119
 120   There should be only one status line object in your program; otherwise
 121   they'll interfere with each other and get confused.
 122
 123   The update algorithm (in `set') is fairly careful to do the right thing
 124   with long status `lines', and to work properly in an Emacs `shell' buffer.
 125   """
 126
 127   def __init__(me):
 128     "Initialize the status line."
 129     me._last = ''
 130     me._lastlen = 0
 131     me.eyecandyp = OS.isatty(SYS.stdout.fileno())
 132
 133   def set(me, line):
 134     """
 135     Set the status line contents to LINE, replacing what was there before.
 136
 137     This only produces actual output if stdout is interactive.
 138     """
 139     n = len(line)
 140
 141     ## Eyecandy update.
 142     if me.eyecandyp:
 143
 144       ## If the old line was longer, we need to clobber its tail, so work out
 145       ## what that involves.
 146       if n < me._lastlen:
 147         b = charwidth(me._last[n:])
 148         pre = '\b'*b + ' '*b
 149       else:
 150         pre = ''
 151
 152       ## Now figure out the length of the common prefix between what we had
 153       ## before and what we have now.  This reduces the amount of I/O done,
 154       ## which keeps network traffic down on SSH links, and keeps down the
 155       ## amount of work slow terminal emulators like Emacs have to do.
 156       i = 0
 157       m = min(n, me._lastlen)
 158       while i < m and line[i] == me._last[i]:
 159         i += 1
 160
 161       ## Actually do the output, all in one syscall.
 162       b = charwidth(me._last[i:])
 163       SYS.stdout.write(pre + '\b'*b + line[i:])
 164       SYS.stdout.flush()
 165
 166     ## Update our idea of what's gone on.
 167     me._lastlen = n
 168     me._last = line
 169
 170   def clear(me):
 171     "Clear the status line.  Just like set('')."
 172     me.set('')
 173
 174   def commit(me, line = None):
 175     """
 176     Commit the current status line, and maybe the string LINE.
 177
 178     If the current status line is nonempty, then commit it to the transcript.
 179     If LINE is not None, then commit that to the transcript too.
 180
 181     After all of this, we clear the status line to get back to a clean state.
 182     """
 183     if me._last:
 184       if me.eyecandyp:
 185         SYS.stdout.write('\n')
 186       else:
 187         SYS.stdout.write(me._last + '\n')
 188     if line is not None:
 189       SYS.stdout.write(line + '\n')
 190     me._lastlen = 0
 191     me._last = ''
 192
 193 STATUS = StatusLine()
 194
 195 def filestatus(file, status):
 196   return '%s%s: %s' % (' '*8, OS.path.basename(file), status)
 197
 198 class ProgressEyecandy (object):
 199   """
 200   Provide amusement while something big and complicated is happening.
 201
 202   This is an abstract class.  Subclasses must provide a method `progress'
 203   returning a pair (CURRENT, MAX) indicating the current progress through the
 204   operation.
 205   """
 206
 207   def __init__(me, what, silentp = False):
 208     """
 209     Initialize a progress meter.
 210
 211     WHAT is a prefix string to be written before the progress eyecandy
 212     itself.
 213     """
 214     me._what = what
 215     me._silentp = silentp
 216     me._spinner = 0
 217     me._start = T.time()
 218
 219   def _fmt_time(me, t):
 220     "Format T as a time, in (maybe hours) minutes and seconds."
 221     s, t = t % 60, int(t/60)
 222     m, h = t % 60, int(t/60)
 223     if h > 0:
 224       return '%d:%02d:%02d' % (h, m, s)
 225     else:
 226       return '%02d:%02d' % (m, s)
 227
 228   def show(me):
 229     "Show the current level of progress."
 230
 231     ## If we're not showing pointless frippery, don't bother at all.
 232     if not STATUS.eyecandyp:
 233       return
 234
 235     ## Update the spinner index.
 236     me._spinner = (me._spinner + 1)%4
 237
 238     ## Fetch the current progress information.  Note that we always fetch
 239     ## both the current and maximum levels, because both might change if an
 240     ## operation revises its idea of how much work needs doing.
 241     cur, max = me.progress()
 242
 243     ## If we couldn't get progress information, display something vaguely
 244     ## amusing anyway.
 245     if cur is None or max is None:
 246       STATUS.set('%s %c [unknown progress]' %
 247                  (me._what, r'/-\|'[me._spinner]))
 248       return
 249
 250     ## Work out -- well, guess -- the time remaining.
 251     if cur:
 252       t = T.time()
 253       eta = me._fmt_time(ceil((t - me._start)*(max - cur)/cur))
 254     else:
 255       eta = '???'
 256
 257     ## Set the status bar.
 258     n = 40*cur/max
 259     STATUS.set('%s %c [%s%s] %3d%% (%s)' % \
 260                (me._what,
 261                 r'/-\|'[me._spinner],
 262                 '='*n, ' '*(40 - n),
 263                 100*cur/max,
 264                 eta))
 265
 266   def done(me, win = True):
 267     "Show a completion notice, or a failure if WIN is false."
 268     if not win:
 269       STATUS.set('%s FAILED!' % me._what)
 270     elif not me._silentp:
 271       STATUS.set('%s done (%s)' %
 272                  (me._what,
 273                   me._fmt_time(T.time() - me._start)))
 274     else:
 275       return
 276     STATUS.commit()
 277
 278 ###--------------------------------------------------------------------------
 279 ### Timeout handling.
 280
 281 KILLSWITCH = TH.Event()
 282
 283 def timeout(t0, t1):
 284   T.sleep(t0)
 285   KILLSWITCH.set()
 286   T.sleep(t1)
 287   moan('dying messily due to timeout')
 288   OS._exit(3)
 289
 290 ###--------------------------------------------------------------------------
 291 ### Parsing utilities.
 292
 293 ## Allow hyphens in identifiers.
 294 IDCHARS = P.alphanums + '-_'
 295 P.Keyword.setDefaultKeywordChars(IDCHARS)
 296
 297 ## Some common kinds of tokens.
 298 Name = P.Word(IDCHARS)
 299 Num = P.Word(P.nums).setParseAction(lambda toks: map(int, toks))
 300 String = P.QuotedString('"', '\\')
 301
 302 ## Handy abbreviations for constructed parser elements.
 303 def K(k): return P.Keyword(k).suppress()
 304 def D(d): return P.Literal(d).suppress()
 305 def R(p): return P.ZeroOrMore(p).setParseAction(lambda s, l, t: [t])
 306 O = P.Optional
 307
 308 ###--------------------------------------------------------------------------
 309 ### Format identification and conversion.
 310
 311 class IdentificationFailure (Exception):
 312   pass
 313
 314 class FileCategory (object):
 315   """
 316   A FileCategory represents a class of files.
 317
 318   For example, it's sensible to consider audio, or image files as a
 319   category.  A file category knows how to recognize member files from
 320   MIME content types.
 321   """
 322
 323   def __init__(me, name, mime_pats, ident):
 324     """
 325     Construct a new category.
 326
 327     The PATS are a list of `fnmatch' patterns to be compared with a MIME
 328     type.  The IDENT is a function which produces an identification object
 329     given a file's name and first-guess MIME type.  The object is passed to a
 330     Format's `check' method to see whether a file needs re-encoding, and to
 331     `convert' to assist with the conversion.
 332
 333     An identification object must have an attribute `mime' which is a set of
 334     possible MIME types accumulated for the object.
 335     """
 336     me.name = name
 337     me._mime_pats = mime_pats
 338     me._ident = ident
 339     CATEGORYMAP[name] = me
 340
 341   def identify(me, file, mime):
 342     """
 343     Attempt to identify FILE, given its apparent MIME type.
 344
 345     If identification succeeds, return an identification object which can be
 346     used by associated file formats; otherwise return None.
 347     """
 348     for p in me._mime_pats:
 349       if not FN.fnmatchcase(mime, p):
 350         continue
 351       try:
 352         return me._ident(file, mime)
 353       except IdentificationFailure:
 354         pass
 355     return None
 356
 357 class BaseFormat (object):
 358   """
 359   A BaseFormat object represents a particular encoding and parameters.
 360
 361   The object can verify (the `check' method) whether a particular file
 362   matches its requirements, and if necessary (`encode') re-encode a file.
 363
 364   Subclasses should define the following methods.
 365
 366   check(ID)
 367           Answer whether the file identified by ID is acceptable according to
 368           the receiver's parameters.
 369
 370   convert(MASTER, ID, TARGET)
 371           Convert the file MASTER, which has been identified as ID, according
 372           to the receiver's parameters, writing the output to TARGET.
 373
 374   Subclasses should also provide these attributes.
 375
 376   CATEGORY
 377           A FileCategory object for the category of files that this format
 378           lives within.
 379
 380   EXT     A file extension to be applied to encoded output files.
 381
 382   NAME    A user-facing name for the format.
 383
 384   PROPS   A parser element to parse a property definition.  It should produce
 385           a pair NAME, VALUE to be stored in a dictionary.
 386
 387   Subclasses for different kinds of file may introduce more subclass
 388   protocol.
 389   """
 390
 391   def fixup(me, path):
 392     """Post-encoding fixups."""
 393     pass
 394
 395 FORMATMAP = {}
 396 CATEGORYMAP = {}
 397
 398 def defformat(name, cls):
 399   "Define a format NAME using class CLS."
 400   if not hasattr(cls, 'NAME'):
 401     raise ValueError, 'abstract class'
 402   if not hasattr(cls, 'CATEGORY'):
 403     raise ValueError, 'no category'
 404   FORMATMAP[name] = cls
 405
 406 class FormatParser (P.ParserElement):
 407   """
 408   Parse a format specifier:
 409
 410   format-spec ::= string [format-properties]
 411   format-properties ::= `{' format-property (`,' format-property)* `}'
 412
 413   The syntax of a format-property is determined by the PROPS attribute on the
 414   named format and its superclasses.
 415   """
 416
 417   name = 'format-spec'
 418
 419   ## We cache the parser elements we generate to avoid enormous consing.
 420   CACHE = {}
 421
 422   def parseImpl(me, s, loc, actp = True):
 423
 424     ## Firstly, determine the format name.
 425     loc, r = Name._parse(s, loc, actp)
 426     fmt = r[0]
 427
 428     ## Look up the format class.
 429     try: fcls = FORMATMAP[fmt]
 430     except KeyError:
 431       raise P.ParseException(s, loc, "Unknown format `%s'" % fmt)
 432
 433     ## Fetch the property-list parser from the cache, if possible; else
 434     ## construct it.
 435     try:
 436       pp = me.CACHE[fmt]
 437     except KeyError:
 438       seen = set()
 439       prop = None
 440       for c in fcls.mro():
 441         try: p = c.PROPS
 442         except AttributeError: continue
 443         if p in seen: continue
 444         if prop is None: prop = p
 445         else: prop |= p
 446         seen.add(p)
 447       if prop is None:
 448         pp = me.CACHE[fmt] = None
 449       else:
 450         props = P.delimitedList(prop)
 451         props.setParseAction(lambda s, l, t: dict(t.asList()))
 452         pp = me.CACHE[fmt] = O(D('{') - props - D('}'))
 453
 454     ## Parse the properties.
 455     if pp is None:
 456       pd = {}
 457     else:
 458       loc, r = pp._parse(s, loc, actp)
 459       if r: pd = r[0]
 460       else: pd = {}
 461
 462     ## Construct the format object and return it.
 463     return loc, fcls(**pd)
 464
 465 Format = FormatParser()
 466
 467 def prop(kw, pval, tag = None):
 468   if tag is None: tag = kw
 469   if pval is None:
 470     p = K(kw)
 471     p.setParseAction(lambda s, l, t: (tag, True))
 472   else:
 473     p = K(kw) + D('=') + pval
 474     p.setParseAction(lambda s, l, t: (tag, t[0]))
 475   return p
 476
 477 ###--------------------------------------------------------------------------
 478 ### Policies and actions.
 479
 480 class Action (object):
 481   """
 482   An Action object represents a conversion action to be performed.
 483
 484   This class isn't intended to be instantiated directly.  It exists to define
 485   some protocol common to all Action objects.
 486
 487   Action objects have the following attributes.
 488
 489   master        The name of the master (source) file.
 490
 491   target        The name of the target (destination) file.
 492
 493   PRIORITY      The priority of the action, for deciding which of two actions
 494                 to perform.  Higher priorities are more likely to win.
 495
 496   Converting an Action to a string describes the action in a simple
 497   user-readable manner.  The `perform' method actually carries the action
 498   out.
 499   """
 500
 501   PRIORITY = 0
 502
 503   def __init__(me, master):
 504     "Stash the MASTER file name for later."
 505     me.master = master
 506
 507   def choose(me, him):
 508     "Choose either ME or HIM and return one."
 509     if him is None or me.PRIORITY > him.PRIORITY:
 510       return me
 511     else:
 512       return him
 513
 514 class CopyAction (Action):
 515   """
 516   An Action object for simply copying a file.
 517
 518   Actually we try to hardlink it first, falling back to a copy later.  This
 519   is both faster and more efficient with regard to disk space.
 520   """
 521
 522   ## Copying is good.  Linking is really good, but we can't tell the
 523   ## difference at this stage.
 524   PRIORITY = 10
 525
 526   def __init__(me, master, targetdir):
 527     "Initialize a CopyAction, from MASTER to the TARGETDIR directory."
 528     Action.__init__(me, master)
 529     me.target = OS.path.join(targetdir, OS.path.basename(master))
 530
 531   def __str__(me):
 532     return 'copy/link'
 533
 534   def perform(me):
 535     "Actually perform a CopyAction."
 536     try:
 537       STATUS.set(filestatus(me.master, 'link'))
 538       OS.link(me.master, me.target)
 539     except OSError, err:
 540       if err.errno != E.EXDEV:
 541         raise
 542       STATUS.set(filestatus(me.master, 'copy'))
 543       new = me.target + '.new'
 544       SH.copyfile(me.master, new)
 545       OS.rename(new, me.target)
 546     STATUS.commit()
 547
 548 class ConvertAction (Action):
 549   """
 550   An Action object for converting a file to a given format.
 551
 552   Additional attributes:
 553
 554   id            The identification object for the master file.
 555
 556   format        The format to which we're meant to conver the master.
 557   """
 558
 559   def __init__(me, master, targetdir, id, format):
 560     "Initialize a ConvertAction."
 561     Action.__init__(me, master)
 562     stem, ext = OS.path.splitext(OS.path.basename(master))
 563     me.target = OS.path.join(targetdir, stem + '.' + format.EXT)
 564     me.id = id
 565     me.format = format
 566
 567   def __str__(me):
 568     return 'convert to %s' % me.format.NAME
 569
 570   def perform(me):
 571     "Acually perform a ConvertAction."
 572     STATUS.set(filestatus(me.master, me))
 573     me.format.convert(me.master, me.id, me.target)
 574
 575 Policy = P.Forward()
 576
 577 class FormatPolicy (object):
 578   """
 579   A FormatPolicy object represents a set of rules for how to convert files.
 580
 581   Given a master file, the FormatPolicy will identify it and return a list of
 582   actions to be performed.  The methods required of a FormatPolicy are:
 583
 584   setcategory(CAT)
 585           Store CAT as the policy's category.  Check that this is consistent
 586           with the policy as stored.
 587
 588   actions(MASTER, TARGETDIR, ID, COHORT)
 589           Given a MASTER file, identified as ID, a target directory
 590           TARGETDIR, and a list COHORT of (FILE, ID) pairs for other files
 591           of the same category in the same directory, return a list of
 592           actions to be performed to get the target directory into the right
 593           form.  The list might be empty if the policy object /rejects/ the
 594           file.
 595   """
 596
 597 class AndPolicy (FormatPolicy):
 598   """
 599   A FormatPolicy which does the union of a bunch of other policies.
 600
 601   Each subsidiary policy is invoked in turn.  The highest-priority action for
 602   each target file is returned.
 603   """
 604
 605   def __init__(me, policies):
 606     me._policies = policies
 607
 608   def setcategory(me, cat):
 609     me.cat = cat
 610     for p in me._policies:
 611       p.setcategory(cat)
 612
 613   def actions(me, master, targetdir, id, cohort):
 614     tmap = {}
 615     for p in me._policies:
 616       for a in p.actions(master, targetdir, id, cohort):
 617         if a.target in tmap:
 618           tmap[a.target] = a.choose(tmap.get(a.target))
 619         else:
 620           tmap[a.target] = a
 621     return tmap.values()
 622
 623 And = K('and') - D('{') - R(Policy) - D('}')
 624 And.setParseAction(lambda s, l, t: AndPolicy(t[0]))
 625
 626 class OrPolicy (FormatPolicy):
 627   """
 628   A FormatPolicy which tries other policies and uses the first that accepts.
 629
 630   Each subsidiary policy is invoked in turn.  If any accepts, the actions it
 631   proposes are turned and no further policies are invoked.  If none accepts
 632   then the file is rejected.
 633   """
 634
 635   def __init__(me, policies):
 636     me._policies = policies
 637
 638   def setcategory(me, cat):
 639     me.cat = cat
 640     for p in me._policies:
 641       p.setcategory(cat)
 642
 643   def actions(me, master, targetdir, id, cohort):
 644     for p in me._policies:
 645       aa = p.actions(master, targetdir, id, cohort)
 646       if aa:
 647         return aa
 648     else:
 649       return []
 650
 651 Or = K('or') - D('{') - R(Policy) - D('}')
 652 Or.setParseAction(lambda s, l, t: OrPolicy(t[0]))
 653
 654 class AcceptPolicy (FormatPolicy):
 655   """
 656   A FormatPolicy which copies files in a particular format.
 657
 658   If all of the files in a cohort are recognized as being in a particular
 659   format (including this one), then accept it with a CopyAction; otherwise
 660   reject.
 661   """
 662
 663   def __init__(me, format):
 664     me._format = format
 665
 666   def setcategory(me, cat):
 667     if me._format.CATEGORY is not cat:
 668       raise ValueError, \
 669             "Accept format `%s' has category `%s', not `%s'" % \
 670             (me._format.__class__.__name__,
 671              me._format.CATEGORY.name, cat.name)
 672     me.cat = cat
 673
 674   def actions(me, master, targetdir, id, cohort):
 675     if me._format.check(id) and \
 676        all(me._format.check(cid) for f, cid in cohort):
 677       return [CopyAction(master, targetdir)]
 678     else:
 679       return []
 680
 681 Accept = K('accept') - Format
 682 Accept.setParseAction(lambda s, l, t: AcceptPolicy(t[0]))
 683
 684 class ConvertPolicy (FormatPolicy):
 685   """
 686   A FormatPolicy which copies files in a particular format or converts if
 687   necessary.
 688   """
 689   def __init__(me, format):
 690     me._format = format
 691
 692   def setcategory(me, cat):
 693     if me._format.CATEGORY is not cat:
 694       raise ValueError, \
 695             "Accept format `%s' has category `%s', not `%s'" % \
 696             (me._format.__class__.__name__,
 697              me._format.CATEGORY.name, cat.name)
 698     me.cat = cat
 699
 700   def actions(me, master, targetdir, id, cohort):
 701     if me._format.check(id):
 702       return [CopyAction(master, targetdir)]
 703     else:
 704       return [ConvertAction(master, targetdir, id, me._format)]
 705
 706 Convert = K('convert') - Format
 707 Convert.setParseAction(lambda s, l, t: ConvertPolicy(t[0]))
 708
 709 Policy << (And | Or | Accept | Convert)
 710
 711 ###--------------------------------------------------------------------------
 712 ### Audio handling, based on GStreamer.
 713
 714 def make_element(factory, name = None, **props):
 715   "Return a new element from the FACTORY with the given NAME and PROPS."
 716   elt = GS.ElementFactory.make(factory, name)
 717   if elt is None: raise ValueError, 'failed to make `%s\' element' % factory
 718   elt.set_properties(**props)
 719   return elt
 720
 721 def link_elements(elts):
 722   "Link the elements ELTS together, in order."
 723   e0 = None
 724   for e1 in elts:
 725     if e0 is not None: e0.link(e1)
 726     e0 = e1
 727
 728 def bin_children(bin):
 729   "Iterate over the (direct) children of a BIN."
 730   iter = bin.iterate_elements()
 731   while True:
 732     rc, elt = iter.next()
 733     if rc == GS.IteratorResult.DONE: break
 734     elif rc != GS.IteratorResult.OK:
 735       raise ValueError, 'iteration failed (%s)' % rc
 736     else: yield elt
 737
 738 class GStreamerProgressEyecandy (ProgressEyecandy):
 739   """
 740   Provide amusement while GStreamer is busy doing something.
 741
 742   The GStreamerProgressEyecandy object is a context manager.  Wrap it round
 743   your GStreamer loop to provide progress information for an operation.
 744   """
 745
 746   def __init__(me, what, elt, **kw):
 747     """
 748     Initialize a progress meter.
 749
 750     WHAT is a prefix string to be written before the progress eyecandy
 751     itself.  ELT is a GStreamer element to interrogate to find the progress
 752     information.
 753     """
 754     me._elt = elt
 755     ProgressEyecandy.__init__(me, what, **kw)
 756
 757   def _update(me):
 758     "Called by GLib main event loop to update the eyecandy."
 759     me.show()
 760     return True
 761
 762   def _timer(me):
 763     """
 764     Update the progress meter.
 765
 766     This is called periodically by the GLib main event-processing loop.
 767     """
 768     me.show()
 769     return True
 770
 771   def progress(me):
 772     "Return the current progress as a pair (CURRENT, MAX)."
 773
 774     ## Fetch the current progress information.  We get the duration each
 775     ## time, because (particularly with VBR-encoded MP3 inputs) the estimated
 776     ## duration can change as we progress.  Hopefully it settles down fairly
 777     ## soon.
 778     ok, t = me._elt.query_position(GS.Format.TIME)
 779     if ok: ok, end = me._elt.query_duration(GS.Format.TIME)
 780     if ok: return t, end
 781     else: return None, None
 782
 783   def __enter__(me):
 784     "Enter context: attach progress meter display."
 785
 786     ## If we're not showing pointless frippery, don't bother at all.
 787     if not STATUS.eyecandyp:
 788       return
 789
 790     ## Update regularly.  The pipeline runs asynchronously.
 791     me._id = G.timeout_add(100, me._update)
 792
 793   def __exit__(me, ty, val, tb):
 794     "Leave context: remove display and report completion or failure."
 795
 796     ## If we're not showing pointless frippery, there's nothing to remove.
 797     if STATUS.eyecandyp:
 798       G.source_remove(me._id)
 799
 800     ## Report completion anyway.
 801     me.done(ty is None)
 802
 803     ## As you were.
 804     return False
 805
 806 class AudioIdentifier (object):
 807   """
 808   Analyses and identifies an audio file.
 809
 810   Important properties are:
 811
 812   cap     A capabilities structure describing the audio file data.  The most
 813           interesting thing in here is probably its name, which is a MIME
 814           type describing the data.
 815
 816   dcap    A capabilities structure describing the decoded audio data.  This
 817           is of interest during conversion.
 818
 819   tags    A dictionary containing metadata tags from the file.  These are in
 820           GStreamer's encoding-independent format.
 821
 822   bitrate An approximation to the stream's bitrate, in kilobits per second.
 823           This might be slow to work out for some files so it's computed on
 824           demand.
 825   """
 826
 827   def _prepare_pipeline(me):
 828     pipe = GS.Pipeline()
 829     bus = pipe.get_bus()
 830
 831     ## The basic recognition kit is based around `decodebin'.  We must keep
 832     ## it happy by giving it sinks for the streams it's found, which it
 833     ## announces asynchronously.
 834     source = make_element('filesrc', 'file', location = me._file)
 835     decoder = make_element('decodebin', 'decode')
 836     sink = make_element('fakesink')
 837     def decoder_pad_arrived(elt, pad):
 838       if pad.get_current_caps()[0].get_name().startswith('audio/'):
 839         elt.link_pads(pad.get_name(), sink, 'sink')
 840     decoder.connect('pad-added', decoder_pad_arrived)
 841     for i in [source, decoder, sink]: pipe.add(i)
 842     link_elements([source, decoder])
 843
 844     ## Done.
 845     return pipe, bus, decoder, sink
 846
 847   def __init__(me, file, mime):
 848     "Initialize the object suitably for identifying FILE."
 849
 850     me._file = file
 851     pipe, bus, decoder, sink = me._prepare_pipeline()
 852
 853     ## Make some initial GStreamer objects.  We'll want the pipeline later if
 854     ## we need to analyse a poorly tagged MP3 stream, so save it away.
 855     loop = G.MainLoop()
 856
 857     ## Arrange to collect tags from the pipeline's bus as they're reported.
 858     tags = {}
 859     fail = []
 860     def bus_message(bus, msg):
 861       ty, s = msg.type, msg.get_structure()
 862       if ty == GS.MessageType.ERROR:
 863         fail[:] = (ValueError, s['debug'], None)
 864         loop.quit()
 865       elif ty == GS.MessageType.STATE_CHANGED:
 866         if s['new-state'] == GS.State.PAUSED and \
 867                msg.src == pipe:
 868           loop.quit()
 869       elif ty == GS.MessageType.TAG:
 870         tt = s['taglist']
 871         for i in xrange(tt.n_tags()):
 872           t = tt.nth_tag_name(i)
 873           if tt.get_tag_size(t) != 1: continue
 874           v = tt.get_value_index(t, 0)
 875           tags[t] = v
 876     bmid = bus.connect('message', bus_message)
 877
 878     ## We want to identify the kind of stream this is.  (Hmm.  The MIME type
 879     ## recognizer has already done this work, but GStreamer is probably more
 880     ## reliable.)  The `decodebin' has a `typefind' element inside which will
 881     ## announce the identified media type.  All we need to do is find it and
 882     ## attach a signal handler.  (Note that the handler might be run in the
 883     ## thread context of the pipeline element, but Python's GIL will keep
 884     ## things from being too awful.)
 885     me.cap = None
 886     me.dcap = None
 887     for e in bin_children(decoder):
 888       if e.get_factory().get_name() == 'typefind':
 889         tfelt = e
 890         break
 891     else:
 892       assert False, 'failed to find typefind element'
 893
 894     ## Crank up most of the heavy machinery.  The message handler will stop
 895     ## the loop when things seem to be sufficiently well underway.
 896     bus.add_signal_watch()
 897     pipe.set_state(GS.State.PAUSED)
 898     loop.run()
 899     bus.disconnect(bmid)
 900     bus.remove_signal_watch()
 901     if fail:
 902       pipe.set_state(GS.State.NULL)
 903       raise fail[0], fail[1], fail[2]
 904
 905     ## Store the collected tags.
 906     me.tags = tags
 907
 908     ## Gather the capabilities.  The `typefind' element knows the input data
 909     ## type.  The 'decodebin' knows the raw data type.
 910     me.cap = tfelt.get_static_pad('src').get_allowed_caps()[0]
 911     me.mime = set([mime, me.cap.get_name()])
 912     me.dcap = sink.get_static_pad('sink').get_allowed_caps()[0]
 913
 914     ## If we found a plausible bitrate then stash it.  Otherwise note that we
 915     ## failed.  If anybody asks then we'll work it out then.
 916     if 'nominal-bitrate' in tags:
 917       me._bitrate = tags['nominal-bitrate']/1000
 918     elif 'bitrate' in tags and tags['bitrate'] >= 80000:
 919       me._bitrate = tags['bitrate']/1000
 920     else:
 921       ok, n = pipe.query_duration(GS.Format.BYTES)
 922       if ok: ok, t = pipe.query_duration(GS.Format.TIME)
 923       if ok: me._bitrate = int((8e6*n)/t)
 924       else: me._bitrate = None
 925     pipe.set_state(GS.State.NULL)
 926
 927   @property
 928   def bitrate(me):
 929     """
 930     Return the approximate bit-rate of the input file.
 931
 932     This might take a while if we have to work it out the hard way.
 933     """
 934
 935     ## If we already know the answer then just return it.
 936     if me._bitrate is not None:
 937       return me._bitrate
 938
 939     ## Make up a new pipeline and main loop.
 940     pipe, bus, _, _ = me._prepare_pipeline()
 941     loop = G.MainLoop()
 942
 943     ## Watch for bus messages.  We'll stop when we reach the end of the
 944     ## stream: then we'll have a clear idea of how long the track was.
 945     fail = []
 946     def bus_message(bus, msg):
 947       ty, s = msg.type, msg.get_structure()
 948       if ty == GS.MessageType.ERROR:
 949         fail[:] = (ValueError, s['debug'], None)
 950         loop.quit()
 951       elif ty == GS.MessageType.EOS:
 952         loop.quit()
 953     bus = pipe.get_bus()
 954     bmid = bus.connect('message', bus_message)
 955
 956     ## Get everything moving, and keep the user amused while we work.
 957     bus.add_signal_watch()
 958     pipe.set_state(GS.State.PLAYING)
 959     with GStreamerProgressEyecandy(filestatus(me._file, 'measure bitrate'),
 960                                    pipe, silentp = True):
 961       loop.run()
 962     bus.remove_signal_watch()
 963     bus.disconnect(bmid)
 964     if fail:
 965       pipe.set_state(GS.State.NULL)
 966       raise fail[0], fail[1], fail[2]
 967     STATUS.clear()
 968
 969     ## The bitrate computation wants the file size.  Ideally we'd want the
 970     ## total size of the frames' contents, but that seems hard to dredge
 971     ## out.  If the framing overhead is small, this should be close enough
 972     ## for our purposes.
 973     bytes = OS.stat(me._file).st_size
 974
 975     ## Now we should be able to find out our position accurately and work out
 976     ## a bitrate.  Cache it in case anybody asks again.
 977     ok, t = pipe.query_position(GS.Format.TIME)
 978     assert ok, 'failed to discover bitrate'
 979     me._bitrate = int(8*bytes*1e6/t)
 980     pipe.set_state(GS.State.NULL)
 981
 982     ## Done.
 983     return me._bitrate
 984
 985 class AudioFormat (BaseFormat):
 986   """
 987   An AudioFormat is a kind of Format specialized for audio files.
 988
 989   Format checks are done on an AudioIdentifier object.
 990   """
 991
 992   PROPS = prop('bitrate', Num)
 993
 994   ## libmagic reports `application/ogg' for Ogg Vorbis files.  We've switched
 995   ## to GIO now, which reports either `audio/ogg' or `audio/x-vorbis+ogg'
 996   ## depending on how thorough it's trying to be.  Still, it doesn't do any
 997   ## harm here; the main risk is picking up Ogg Theora files by accident, and
 998   ## we'll probably be able to extract the audio from them anyway.
 999   CATEGORY = FileCategory('audio', ['audio/*', 'application/ogg'],
1000                           AudioIdentifier)
1001
1002   def __init__(me, bitrate = None):
1003     "Construct an object, requiring an approximate bitrate."
1004     me.bitrate = bitrate
1005
1006   def check(me, id):
1007     """
1008     Return whether the AudioIdentifier ID is suitable for our purposes.
1009
1010     Subclasses can either override this method or provide a property
1011     `MIMETYPES', which is a list (other thing that implements `__contains__')
1012     of GStreamer MIME types matching this format.
1013     """
1014     return id.mime & me.MIMETYPES and \
1015            (me.bitrate is None or id.bitrate <= me.bitrate * sqrt(2))
1016
1017   def encoder(me):
1018     """
1019     Constructs a GStreamer element to encode audio input.
1020
1021     Subclasses can either override this method (or replace `encode'
1022     entirely), or provide a method `encoder_chain' which returns a list of
1023     elements to be linked together in sequence.  The first element in the
1024     chain must have a pad named `sink' and the last must have a pad named
1025     `src'.
1026     """
1027     elts = me.encoder_chain()
1028     bin = GS.Bin()
1029     for i in elts: bin.add(i)
1030     link_elements(elts)
1031     bin.add_pad(GS.GhostPad('sink', elts[0].get_static_pad('sink')))
1032     bin.add_pad(GS.GhostPad('src', elts[-1].get_static_pad('src')))
1033     return bin
1034
1035   def convert(me, master, id, target):
1036     """
1037     Encode audio from MASTER, already identified as ID, writing it to TARGET.
1038
1039     See `encoder' for subclasses' responsibilities.
1040     """
1041
1042     ## Construct the necessary equipment.
1043     pipe = GS.Pipeline()
1044     bus = pipe.get_bus()
1045     loop = G.MainLoop()
1046
1047     ## Make sure that there isn't anything in the way of our output.  We're
1048     ## going to write to a scratch file so that we don't get confused by
1049     ## half-written rubbish left by a crashed program.
1050     new = target + '.new'
1051     try:
1052       OS.unlink(new)
1053     except OSError, err:
1054       if err.errno != E.ENOENT:
1055         raise
1056
1057     ## Piece together our pipeline.  The annoying part is that the
1058     ## `decodebin' doesn't have any source pads yet, so our chain is in two
1059     ## halves for now.
1060     source = make_element('filesrc', 'source', location = master)
1061     decoder = make_element('decodebin', 'decode')
1062     convert = make_element('audioconvert', 'convert')
1063     encoder = me.encoder()
1064     sink = make_element('filesink', 'sink', location = new)
1065     for i in [source, decoder, convert, encoder, sink]: pipe.add(i)
1066     link_elements([source, decoder])
1067     link_elements([convert, encoder, sink])
1068
1069     ## Some decoders (e.g., the AC3 decoder) include channel-position
1070     ## indicators in their output caps.  The Vorbis encoder interferes with
1071     ## this, and you end up with a beautifully encoded mono signal from a
1072     ## stereo source.  From a quick butchers at the `vorbisenc' source, I
1073     ## /think/ that this is only a problem with stereo signals: mono signals
1074     ## are mono already, and `vorbisenc' accepts channel positions if there
1075     ## are more than two channels.
1076     ##
1077     ## So we have this bodge.  We already collected the decoded audio caps
1078     ## during identification.  So if we see 2-channel audio with channel
1079     ## positions, we strip the positions off forcibly by adding a filter.
1080     if id.dcap.get_name().startswith('audio/x-raw-') and \
1081        id.dcap.has_field('channels') and \
1082        id.dcap['channels'] == 2 and \
1083        id.dcap.has_field('channel-positions'):
1084       dcap = GS.Caps()
1085       c = id.dcap.copy()
1086       c.remove_field('channel-positions')
1087       dcap.append(c)
1088     else:
1089       dcap = None
1090
1091     ## Hook onto the `decodebin' so we can link together the two halves of
1092     ## our encoding chain.  For now, we'll hope that there's only one audio
1093     ## stream in there, and just throw everything else away.
1094     def decoder_pad_arrived(elt, pad):
1095       if pad.get_current_caps()[0].get_name().startswith('audio/'):
1096         if dcap:
1097           elt.link_pads_filtered(pad.get_name(), convert, 'sink', dcap)
1098         else:
1099           elt.link_pads(pad.get_name(), convert, 'sink')
1100     decoder.connect('pad-added', decoder_pad_arrived)
1101
1102     ## Watch the bus for completion messages.
1103     fail = []
1104     def bus_message(bus, msg):
1105       if msg.type == GS.MessageType.ERROR:
1106         fail[:] = (ValueError, msg.get_structure()['debug'], None)
1107         loop.quit()
1108       elif msg.type == GS.MessageType.EOS:
1109         loop.quit()
1110     bmid = bus.connect('message', bus_message)
1111
1112     ## Get everything ready and let it go.
1113     bus.add_signal_watch()
1114     pipe.set_state(GS.State.PLAYING)
1115     with GStreamerProgressEyecandy(filestatus(master,
1116                                               'convert to %s' % me.NAME),
1117                                    pipe):
1118       loop.run()
1119     pipe.set_state(GS.State.NULL)
1120     bus.remove_signal_watch()
1121     bus.disconnect(bmid)
1122     if fail:
1123       raise fail[0], fail[1], fail[2]
1124
1125     ## Fix up the output file if we have to.
1126     me.fixup(new)
1127
1128     ## We're done.
1129     OS.rename(new, target)
1130
1131 class OggVorbisFormat (AudioFormat):
1132   "AudioFormat object for Ogg Vorbis."
1133
1134   ## From https://en.wikipedia.org/wiki/Vorbis
1135   QMAP = [(-1,  45), ( 0,  64), ( 1,  80), ( 2,  96),
1136           ( 3, 112), ( 4, 128), ( 5, 160), ( 6, 192),
1137           ( 7, 224), ( 8, 256), ( 9, 320), (10, 500)]
1138
1139   NAME = 'Ogg Vorbis'
1140   MIMETYPES = set(['application/ogg', 'audio/x-vorbis', 'audio/ogg',
1141                    'audio/x-vorbis+ogg'])
1142   EXT = 'ogg'
1143
1144   def encoder_chain(me):
1145     encprops = {}
1146     if me.bitrate is not None:
1147       for q, br in me.QMAP:
1148         if br >= me.bitrate:
1149           break
1150       else:
1151         raise ValueError, 'no suitable quality setting found'
1152       encprops['quality'] = q/10.0
1153     return [make_element('vorbisenc', **encprops),
1154             make_element('oggmux')]
1155
1156 defformat('ogg-vorbis', OggVorbisFormat)
1157
1158 class MP3Format (AudioFormat):
1159   "AudioFormat object for MP3."
1160
1161   NAME = 'MP3'
1162   MIMETYPES = set(['audio/mpeg'])
1163   EXT = 'mp3'
1164
1165   def encoder_chain(me):
1166     encprops = {}
1167     if me.bitrate is not None:
1168       encprops['bitrate'] = me.bitrate
1169       encprops['target'] = 'bitrate'
1170     else:
1171       encprops['quality'] = 4
1172       encprops['target'] = 'quality'
1173     return [make_element('lamemp3enc', quality = 4, **encprops),
1174             make_element('xingmux'),
1175             make_element('id3v2mux')]
1176
1177   def fixup(me, path):
1178     """
1179     Fix up MP3 files.
1180
1181     GStreamer produces ID3v2 tags, but not ID3v1.  This seems unnecessarily
1182     unkind to stupid players.
1183     """
1184     f = E3.load(path)
1185     if f is None: return
1186     t = f.tag
1187     if t is None: return
1188     for v in [E3.id3.ID3_V2_3, E3.id3.ID3_V1]:
1189       try: f.tag.save(version = v)
1190       except (UnicodeEncodeError,
1191               E3.id3.GenreException,
1192               E3.id3.TagException):
1193         pass
1194
1195 defformat('mp3', MP3Format)
1196
1197 ###--------------------------------------------------------------------------
1198 ### Image handling, based on the Python Imaging Library.
1199
1200 class ImageIdentifier (object):
1201   """
1202   Analyses and identifies an image file.
1203
1204   Simply leaves an Image object in the `img' property which can be inspected.
1205   """
1206
1207   def __init__(me, file, mime):
1208
1209     ## Get PIL to open the file.  It will magically work out what kind of
1210     ## file it is.
1211     try:
1212       me.img = I.open(file)
1213     except IOError, exc:
1214
1215       ## Unhelpful thing to raise on identification failure.  We can
1216       ## distinguish this from an actual I/O error because it doesn't have an
1217       ## `errno'.
1218       if exc.errno is None:
1219         raise IdentificationFailure
1220       raise
1221
1222     me.mime = set([mime])
1223
1224 class ImageFormat (BaseFormat):
1225   """
1226   An ImageFormat is a kind of Format specialized for image files.
1227
1228   Subclasses don't need to provide anything other than the properties
1229   required by all concrete Format subclasses.  However, there is a
1230   requirement that the `NAME' property match PIL's `format' name for the
1231   format.
1232   """
1233
1234   PROPS = prop('size', Num)
1235   CATEGORY = FileCategory('image', ['image/*'], ImageIdentifier)
1236
1237   def __init__(me, size = None, **kw):
1238     """
1239     Initialize an ImageFormat object.
1240
1241     Additional keywords are used when encoding, and may be recognized by
1242     enhanced `check' methods in subclasses.
1243     """
1244     me._size = size
1245     me._props = kw
1246
1247   def check(me, id):
1248     "Check whether the ImageIdentifier ID matches our requirements."
1249     return id.img.format == me.NAME and \
1250            (me._size is None or
1251             (id.img.size[0] <= me._size and
1252              id.img.size[1] <= me._size))
1253
1254   def convert(me, master, id, target):
1255     "Encode the file MASTER, identified as ID, writing the result to TARGET."
1256
1257     ## Write to a scratch file.
1258     new = target + '.new'
1259
1260     ## The ImageIdentifier already contains a copy of the open file.  It
1261     ## would be wasteful not to use it.
1262     img = id.img
1263     STATUS.set(filestatus(master, 'convert to %s' % me.NAME))
1264
1265     ## If there's a stated maximum size then scale the image down to match.
1266     ## But thumbnailing clobbers the original, so take a copy.
1267     if me._size is not None and \
1268            (img.size[0] > me._size or img.size[1] > me._size):
1269       img = img.copy()
1270       img.thumbnail((me._size, me._size), I.ANTIALIAS)
1271
1272     ## Write the output image.
1273     img.save(new, me.NAME, **me._props)
1274
1275     ## Fix it up if necessary.
1276     me.fixup(new)
1277
1278     ## We're done.
1279     OS.rename(new, target)
1280     STATUS.commit()
1281
1282 class JPEGFormat (ImageFormat):
1283   """
1284   Image format for JPEG (actually JFIF) files.
1285
1286   Interesting properties to set:
1287
1288   optimize
1289           If present, take a second pass to select optimal encoder settings.
1290
1291   progressive
1292           If present, make a progressive file.
1293
1294   quality Integer from 1--100 (worst to best); default is 75.
1295   """
1296   EXT = 'jpg'
1297   NAME = 'JPEG'
1298   PROPS = prop('optimize', None) \
1299     | prop('progressive', None, 'progression') \
1300     | prop('quality', Num)
1301
1302 defformat('jpeg', JPEGFormat)
1303
1304 class PNGFormat (ImageFormat):
1305   """
1306   Image format for PNG files.
1307
1308   Interesting properties:
1309
1310   optimize
1311           If present, make a special effort to minimize the output file.
1312   """
1313   EXT = 'png'
1314   NAME = 'PNG'
1315   PROPS = prop('optimize', None)
1316
1317 defformat('png', PNGFormat)
1318
1319 class BMPFormat (ImageFormat):
1320   """
1321   Image format for Windows BMP files, as used by RockBox.
1322
1323   No additional properties.
1324   """
1325   NAME = 'BMP'
1326   EXT = 'bmp'
1327
1328 defformat('bmp', BMPFormat)
1329
1330 ###--------------------------------------------------------------------------
1331 ### Remaining parsing machinery.
1332
1333 Type = K('type') - Name - D('{') - R(Policy) - D('}')
1334 def build_type(s, l, t):
1335   try:
1336     cat = CATEGORYMAP[t[0]]
1337   except KeyError:
1338     raise P.ParseException(s, loc, "Unknown category `%s'" % t[0])
1339   pols = t[1]
1340   if len(pols) == 1: pol = pols[0]
1341   else: pol = AndPolicy(pols)
1342   pol.setcategory(cat)
1343   return pol
1344 Type.setParseAction(build_type)
1345
1346 TARGETS = []
1347 class TargetJob (object):
1348   def __init__(me, targetdir, policies):
1349     me.targetdir = targetdir
1350     me.policies = policies
1351   def perform(me):
1352     TARGETS.append(me)
1353
1354 Target = K('target') - String - D('{') - R(Type) - D('}')
1355 def build_target(s, l, t):
1356   return TargetJob(t[0], t[1])
1357 Target.setParseAction(build_target)
1358
1359 VARS = { 'master': None }
1360 class VarsJob (object):
1361   def __init__(me, vars):
1362     me.vars = vars
1363   def perform(me):
1364     for k, v in me.vars:
1365       VARS[k] = v
1366
1367 Var = prop('master', String)
1368 Vars = K('vars') - D('{') - R(Var) - D('}')
1369 def build_vars(s, l, t):
1370   return VarsJob(t[0])
1371 Vars.setParseAction(build_vars)
1372
1373 TopLevel = Vars | Target
1374 Config = R(TopLevel)
1375 Config.ignore(P.pythonStyleComment)
1376
1377 ###--------------------------------------------------------------------------
1378 ### The directory grobbler.
1379
1380 def grobble(master, targets, noact = False):
1381   """
1382   Work through the MASTER directory, writing converted files to TARGETS.
1383
1384   The TARGETS are a list of `TargetJob' objects, each describing a target
1385   directory and a policy to apply to it.
1386
1387   If NOACT is true, then don't actually do anything permanent to the
1388   filesystem.
1389   """
1390
1391   ## Transform the targets into a more convenient data structure.
1392   tpolmap = []
1393   for t in targets:
1394     pmap = {}
1395     tpolmap.append(pmap)
1396     for p in t.policies: pmap.setdefault(p.cat, []).append(p)
1397
1398   ## Keep track of the current position in the master tree.
1399   dirs = []
1400
1401   ## And the files which haven't worked.
1402   broken = []
1403
1404   def grobble_file(master, pmap, targetdir, cohorts):
1405     ## Convert MASTER, writing the result to TARGETDIR.
1406     ##
1407     ## The COHORTS are actually (CAT, ID, COHORT) triples, where a COHORT is
1408     ## a list of (FILENAME, ID) pairs.
1409     ##
1410     ## Since this function might convert the MASTER file, the caller doesn't
1411     ## know the name of the output files, so we return then as a list.
1412
1413     done = set()
1414     st_m = OS.stat(master)
1415
1416     ## Work through each category listed and apply its policy.
1417     for cat, id, cohort in cohorts:
1418
1419       ## Go through the category's policies and see if any match.  If we fail
1420       ## here, see if there are more categories to try.
1421       for pol in pmap[cat]:
1422         acts = pol.actions(master, targetdir, id, cohort)
1423         if acts: break
1424       else:
1425         continue
1426
1427       ## Work through the targets one by one.
1428       for a in acts:
1429         done.add(a.target)
1430
1431         ## Find out whether the target file already exists and is up-to-date
1432         ## with respect to the master.  (Caution here with low-resolution
1433         ## timestamps.)  If it's OK, then just move on.
1434         try:
1435           st_t = OS.stat(a.target)
1436           if st_m.st_mtime < st_t.st_mtime or \
1437                  (st_m.st_ino, st_m.st_dev) == (st_t.st_ino, st_t.st_dev):
1438             continue
1439         except OSError, err:
1440           if err.errno not in (E.ENOENT, E.ENOTDIR):
1441             raise
1442
1443         ## We have real work to do.  If there's a current status message,
1444         ## it's the containing directory so flush it so that people know
1445         ## where we are.
1446         STATUS.commit()
1447
1448         ## Remove the target.  (A hardlink will fail if the target already
1449         ## exists.)
1450         if not noact:
1451           try:
1452             OS.unlink(a.target)
1453           except OSError, err:
1454             if err.errno not in (E.ENOENT, E.ENOTDIR):
1455               raise
1456
1457         ## Do whatever it is we decided to do.
1458         if noact:
1459           STATUS.commit(filestatus(master, a))
1460         else:
1461           a.perform()
1462
1463     ## We're done.  Return the names of the targets.
1464     return list(done)
1465
1466   @contextmanager
1467   def wrap(masterfile):
1468     ## Handle exceptions found while trying to convert a particular file or
1469     ## directory.
1470
1471     try:
1472       yield masterfile
1473
1474     ## Something bad happened.  Report the error, but continue.  (This list
1475     ## of exceptions needs a lot of work.)
1476     except (IOError, OSError), exc:
1477       STATUS.clear()
1478       STATUS.commit(filestatus(masterfile, 'failed (%s)' % exc))
1479       broken.append((masterfile, exc))
1480
1481   def grobble_dir(master, targets):
1482     ## Recursively convert files in MASTER, writing them to the TARGETS.
1483
1484     ## Keep track of the subdirectories we encounter, because we'll need to
1485     ## do all of those in one go at the end.
1486     subdirs = set()
1487
1488     ## Work through each target directory in turn.
1489     for target, pmap in zip(targets, tpolmap):
1490
1491       ## Make sure the TARGET exists and is a directory.  It's a fundamental
1492       ## assumption of this program that the entire TARGET tree is
1493       ## disposable, so if something exists but isn't a directory, we should
1494       ## kill it.
1495       if OS.path.isdir(target):
1496         pass
1497       else:
1498         if OS.path.exists(target):
1499           STATUS.commit(filestatus(target, 'clear nondirectory'))
1500           if not noact:
1501             OS.unlink(target)
1502         STATUS.commit(filestatus(target, 'create directory'))
1503         if not noact:
1504           OS.mkdir(target)
1505
1506       ## Keep a list of things in the target.  As we convert files, we'll
1507       ## check them off.  Anything left over is rubbish and needs to be
1508       ## deleted.
1509       checklist = {}
1510       try:
1511         for i in OS.listdir(target):
1512           checklist[i] = False
1513       except OSError, err:
1514         if err.errno not in (E.ENOENT, E.ENOTDIR):
1515           raise
1516
1517       ## Keep track of the files in each category.
1518       catmap = {}
1519       todo = []
1520       done = []
1521
1522       ## Work through the master files.
1523       for f in sorted(OS.listdir(master)):
1524
1525         ## If the killswitch has been pulled then stop.  The whole idea is
1526         ## that we want to cause a clean shutdown if possible, so we don't
1527         ## want to do it in the middle of encoding because the encoding
1528         ## effort will have been wasted.  This is the only place we need to
1529         ## check.  If we've exited the loop, then clearing old files will
1530         ## probably be fast, and we'll either end up here when the recursive
1531         ## call returns or we'll be in the same boat as before, clearing old
1532         ## files, only up a level.  If worst comes to worst, we'll be killed
1533         ## forcibly somewhere inside `SH.rmtree', and that can continue where
1534         ## it left off.
1535         if KILLSWITCH.is_set():
1536           return
1537
1538         ## Do something with the file.
1539         with wrap(OS.path.join(master, f)) as masterfile:
1540
1541           ## If it's a directory then prepare to grobble it recursively, but
1542           ## don't do that yet.
1543           if OS.path.isdir(masterfile):
1544             subdirs.add(f)
1545             done.append(OS.path.join(target, f))
1546
1547           ## Otherwise it's a file.  Work out what kind, and stash it under
1548           ## the appropriate categories.  Later, we'll apply policy to the
1549           ## files, by category, and work out what to do with them all.
1550           else:
1551             mime = GIO.file_new_for_path(masterfile) \
1552                       .query_info('standard::content-type', 0) \
1553                       .get_content_type()
1554             cats = []
1555             for cat in pmap.iterkeys():
1556               id = cat.identify(masterfile, mime)
1557               if id is None: continue
1558               catmap.setdefault(cat, []).append((masterfile, id))
1559               cats.append((cat, id))
1560             if not cats:
1561               catmap.setdefault(None, []).append((masterfile, id))
1562             todo.append((masterfile, cats))
1563
1564       ## Work through the categorized files to see what actions to do for
1565       ## them.
1566       for masterfile, cats in todo:
1567         with wrap(masterfile):
1568           done += grobble_file(masterfile, pmap, target,
1569                                [(cat, id, catmap[cat]) for cat, id in cats])
1570
1571       ## Check the results off the list so that we don't clear it later.
1572       for f in done:
1573         checklist[OS.path.basename(f)] = True
1574
1575       ## Maybe there's stuff in the target which isn't accounted for.  Delete
1576       ## it: either the master has changed, or the policy for this target has
1577       ## changed.  Either way, the old files aren't wanted.
1578       for f in checklist:
1579         if not checklist[f]:
1580           STATUS.commit(filestatus(f, 'clear bogus file'))
1581           if not noact:
1582             bogus = OS.path.join(target, f)
1583             try:
1584               if OS.path.isdir(bogus):
1585                 SH.rmtree(bogus)
1586               else:
1587                 OS.unlink(bogus)
1588             except OSError, err:
1589               if err.errno != E.ENOENT:
1590                 raise
1591
1592     ## If there are subdirectories which want processing then do those.
1593     ## Keep the user amused by telling him where we are in the tree.
1594     for d in sorted(subdirs):
1595       dirs.append(d)
1596       STATUS.set('/'.join(dirs))
1597       with wrap(OS.path.join(master, d)) as masterdir:
1598         try:
1599           grobble_dir(masterdir,
1600                       [OS.path.join(target, d) for target in targets])
1601         finally:
1602           dirs.pop()
1603           STATUS.set('/'.join(dirs))
1604
1605   ## Right.  We're ready to go.
1606   grobble_dir(master, [t.targetdir for t in targets])
1607   return broken
1608
1609 ###--------------------------------------------------------------------------
1610 ### Command-line interface.
1611
1612 QUIS = OS.path.basename(SYS.argv[0])
1613
1614 def moan(msg):
1615   "Report a warning message to the user."
1616   SYS.stderr.write('%s: %s\n' % (QUIS, msg))
1617
1618 def die(msg):
1619   "Report a fatal error message to the user."
1620   moan(msg)
1621   SYS.exit(1)
1622
1623 def parse_opts(args):
1624   """
1625   Parse command-line arguments in ARGS.
1626
1627   Returns a Grobbler object and the MASTER and TARGET directories to be
1628   grobbled.
1629   """
1630
1631   ## Build the option parser object.
1632   op = OP.OptionParser(prog = QUIS, version = VERSION,
1633                        usage = '%prog [-in] [-t TIMEOUT] [-T TIMEOUT] '
1634                                'CONFIG',
1635                        description = """\
1636 Convert a directory tree of files according to the configuration file
1637 CONFIG.
1638 """)
1639
1640   ## Timeout handling.
1641   def cb_time(opt, ostr, arg, op):
1642     m = RX.match(r'\s*(\d+)\s*([dhms]?)\s*', arg)
1643     if not m:
1644       raise OP.OptionValueerror, 'bad time value `%s\'' % arg
1645     t, u = m.groups()
1646     t = int(t) * { '': 1, 's': 1, 'm': 60, 'h': 3600, 'd': 86400 }[u]
1647     setattr(op.values, opt.dest, t)
1648   op.add_option('-t', '--timeout', type = 'string', metavar = 'SECS',
1649                 dest = 'timeout',
1650                 help = 'stop processing nicely after SECS',
1651                 action = 'callback', callback = cb_time)
1652   op.add_option('-T', '--timeout-nasty', type = 'string', metavar = 'SECS',
1653                 dest = 'timeout_nasty',
1654                 help = 'stop processing unpleasantly after further SECS',
1655                 action = 'callback', callback = cb_time)
1656
1657   ## Other options.
1658   op.add_option('-i', '--interactive', action = 'store_true', dest = 'tty',
1659                 help = 'provide progress information')
1660   op.add_option('-n', '--no-act', action = 'store_true', dest = 'noact',
1661                 help = 'don\'t actually modify the filesystem')
1662
1663   ## Ready to rock.
1664   op.set_defaults(formats = [], noact = False,
1665                   timeout = None, timeout_nasty = 300)
1666   opts, args = op.parse_args(args)
1667
1668   ## Check that we got the non-option arguments that we want.
1669   if len(args) != 1:
1670     op.error('wrong number of arguments')
1671
1672   ## Act on the options.
1673   if opts.tty:
1674     STATUS.eyecandyp = True
1675   if opts.timeout is not None:
1676     to = TH.Thread(target = timeout,
1677                    args = (opts.timeout, opts.timeout_nasty))
1678     to.daemon = True
1679     to.start()
1680
1681   ## Parse the configuration file.
1682   with open(args[0]) as conf:
1683     jobs, = Config.parseFile(conf, True)
1684   for j in jobs:
1685     j.perform()
1686
1687   return opts
1688
1689 if __name__ == '__main__':
1690   opts = parse_opts(SYS.argv[1:])
1691   if 'master' not in VARS:
1692     die("no master directory set")
1693   broken = grobble(VARS['master'], TARGETS, opts.noact)
1694   if broken:
1695     moan('failed to convert some files:')
1696     for file, exc in broken:
1697       moan('%s: %s' % (file, exc))
1698     SYS.exit(1)
1699
1700   ## This is basically a successful completion: we did what we were asked to
1701   ## do.  It seems polite to report a message, though.
1702   ##
1703   ## Why don't we have a nonzero exit status?  The idea would be that a
1704   ## calling script would be interested that we used up all of our time, and
1705   ## not attempt to convert some other directory as well.  But that doesn't
1706   ## quite work.  Such a script would need to account correctly for time we
1707   ## had spent even if we complete successfully.  And if the script is having
1708   ## to watch the clock itself, it can do that without our help here.
1709   if KILLSWITCH.is_set():
1710     moan('killed by timeout')
1711
1712 ###----- That's all, folks --------------------------------------------------