X-Git-Url: https://git.distorted.org.uk/~mdw/catacomb/blobdiff_plain/28ffcb2af24deaf7d77f62b48246b768e16c6a07..HEAD:/symm/multigen diff --git a/symm/multigen b/symm/multigen index d532cda7..8af0f108 100755 --- a/symm/multigen +++ b/symm/multigen @@ -30,52 +30,131 @@ import itertools as IT import optparse as OP import os as OS import re as RX -from cStringIO import StringIO +import sys as SYS +if SYS.version_info >= (3,): from io import StringIO +else: from cStringIO import StringIO from sys import argv, exit, stderr ###-------------------------------------------------------------------------- ### Utilities. -QUIS = OS.path.basename(argv[0]) +QUIS = OS.path.basename(argv[0]) # Program name, for use in errors. def die(msg): + """Report MSG as a fatal error, and exit.""" stderr.write('%s: %s\n' % (QUIS, msg)) exit(1) def indexed(seq): + """ + Generate pairs (I, X), where I counts from zero and X are the items of SEQ. + """ return IT.izip(IT.count(), seq) +if SYS.version_info >= (3,): + def func_name(func): return func.__name__ + IT.izip = zip +else: + def func_name(func): return func.func_name + +try: next +except NameError: + def next(obj): return obj.next() + ###-------------------------------------------------------------------------- ### Reading the input values. +## Map column names to (Relation, # index) pairs. COLMAP = {} class Cursor (object): + """ + A Cursor object keeps track of an iteration through a Relation. + + At any time, the Cursor has a `current' row; the individual cells of this + row may be retrieved using Python's standard indexing operator. The `step' + method advances to the next row (if there is one). The `reset' method + returns to row zero. + """ + def __init__(me, rel): + """ + Initialize a new Cursor object, tracking its way through a Relation REL. + + The new Cursor has row zero as its current row. The REL must not be + empty. 
+ """ me._rel = rel - me._i = 0 - me._row = rel[0] + me.reset() + def step(me): + """ + Advance the Cursor to the next row. + + Returns False if there is no next row; otherwise True. + """ me._i += 1 if me._i >= len(me._rel): me._i = me._row = None return False me._row = me._rel[me._i] return True + def reset(me): + """ + Reset the Cursor, so that row zero is current again. + """ me._i = 0 me._row = me._rel[0] + def __getitem__(me, i): + """ + Return the item in column I of the Cursor's current row. + + The index must be acceptable to the underlying row object, but otherwise + the Cursor imposes no restrictions. Indices need not be numeric, for + example. + """ return me._row[i] + def __repr__(me): + """ + Return a text description of the Cursor, for diagnostic use. + """ return '#' % (me._rel, me._i, me._row) class CursorSet (object): + """ + A CursorSet iterates over the cartesian product of a number of Relations. + + More precisely: it maintains a stack, each level of which tracks a number + of Relations. More Relations can be pushed onto this stack with the `push' + method, and removed with `pop'. The `step' method advances through the + cartesian product of the Relations in the top level of the stack -- the + `active' Relations. Columns from the current rows of all of the currently + known Relations -- whether active or not -- can be extracted using `get'. + """ + def __init__(me): + """ + Initialize a new CursorSet object. + + A new CursorSet has an empty stack. + """ me._map = {} me._stack = [] me._act = None + def push(me, rels): + """ + Push the new Relations RELS onto the stack and start iterating. + + The currently active Relations are pushed down. Those Relations which are + not already known to the CursorSet become the newly active collection. + (Relations which are already known are simply ignored.) + + Iteration traverses Relations on the right more rapidly. 
+ """ cc = [] rr = [] for r in rels: @@ -85,7 +164,14 @@ class CursorSet (object): cc.append(c) me._stack.append((me._act, rr)) me._act = cc + def step(me): + """ + Advance the CursorSet through the currently active Relations. + + Return False if the active Relations have now been exhausted; otherwise + return True. + """ i = 0 while i < len(me._act): if me._act[i].step(): return True @@ -93,38 +179,92 @@ class CursorSet (object): me._act[i].reset() i += 1 return False + def pop(me): + """ + Pop the active Relations. + + Return to iterating over the previously active collection. + """ me._act, rels = me._stack.pop() for r in rels: del me._map[r] + def get(me, rel, i): + """ + Return the item with index I in the current row of Relation REL. + """ return me._map[rel][i] class Relation (object): + """ + A Relation keeps track of a table of data. + + A Relation consists of a `header', which is a sequence of string names, + and a rectangular array of data, each row of which has the same number of + items as the header. + + Relations can be iterated over using Cursors and CursorSets. + """ + def __init__(me, head): + """ + Initialize a new, empty Relation with header HEAD. + + The `COLMAP' dictionary is updated to map the names in the header to this + Relation and its column indices. + """ me._head = head me._rows = [] for i, c in indexed(head): COLMAP[c] = me, i + def addrow(me, row): + """ + Add a ROW to the Relation. + + The new row must have the correct number of entries. 
+ """ if len(row) != len(me._head): die("mismatch: row `%s' doesn't match heading `%s'" % - (', '.join(row), ', '.join(head))) + (', '.join(row), ', '.join(me._head))) me._rows.append(row) + def __len__(me): + """Return the number of rows in the Relation.""" return len(me._rows) + def __getitem__(me, i): + """Return the Ith row of the Relation.""" return me._rows[i] + def __repr__(me): + """Return a textual description of the Relation, for diagnostic use.""" return '#' % me._head def read_immediate(word): + """ + Return a Relation constructed by parsing WORD. + + The WORD has the form `HEAD=ROW ROW ...', where the HEAD and ROWs are + comma-separated lists of strings which will form the relation's header and + rows respectively. There is no way to include an item which contains a + comma or whitespace. + """ head, rels = word.split('=', 1) rel = Relation([c.strip() for c in head.split(',')]) for row in rels.split(): rel.addrow([c.strip() for c in row.split(',')]) def read_file(spec): + """ + Return a Relation constructed from a file, according to SPEC. + + The SPEC has the form `FILE:HEAD', where FILE names a file, and HEAD is a + comma-separated list of strings to form the relation's header. Each line + from the file which is neither empty nor begins with `#' is split into + whitespace-separated words to form a row in the relation. There is no way + to include an item which contains whitespace. + """ file, head = spec.split(':', 1) rel = Relation([c.strip() for c in head.split(',')]) - cols = [c.strip() for c in head.split(',')] with open(file) as f: for line in f: line = line.strip() @@ -132,6 +272,13 @@ def read_file(spec): rel.addrow(line.split()) def read_thing(spec): + """ + Return a relation constructed from SPEC. + + If SPEC begins with `@' then read the relation from a file (see + `read_file'); otherwise interpret it as immediate data (see + `read_immediate'). 
+ """ if spec.startswith('@'): read_file(spec[1:]) else: read_immediate(spec) @@ -139,64 +286,153 @@ def read_thing(spec): ### Template structure. class BasicTemplate (object): + """ + Base class for template objects. + + The protocol for templates consists of two methods: + + relations() Return a set of Relations mentioned at top-level in + substitutions in the template. + + subst(OUT, CS) Fill in the template, writing the output to the + stream OUT. The CS is a CursorSet object tracking + the current iteration state. + """ pass class LiteralTemplate (BasicTemplate): + """ + A LiteralTemplate outputs a fixed string. + """ + def __init__(me, text, **kw): + """ + Initialize a new LiteralTemplate object. TEXT is the text to be written. + """ super(LiteralTemplate, me).__init__(**kw) me._text = text + def relations(me): + """A LiteralTemplate contains no substitutions.""" return set() + def subst(me, out, cs): + """A LiteralTemplate just emits its text.""" out.write(me._text) + def __repr__(me): return '#' % me._text class TagTemplate (BasicTemplate): + """ + A TagTemplate object expands a substitution tag. + + It extracts an item from the current row of a relation, processes it + according to an operation, and outputs the result. + """ + def __init__(me, rel, i, op, **kw): + """ + Initialize a new TagTemplate object. + + REL is the relation from which to pick the output; I is the column index; + OP is a transformation to apply to the data, and may be None to indicate + that the data should not be transformed. + """ super(TagTemplate, me).__init__(**kw) me._rel = rel me._i = i me._op = op + def relations(me): + """The TagTemplate knows which relation it uses.""" return set([me._rel]) + def subst(me, out, cs): + """ + A TagTemplate extracts and transforms an item from the current row of + a relation. 
+ """ val = cs.get(me._rel, me._i) if me._op is not None: val = me._op(val) out.write(val) + def __repr__(me): return '#' % me._rel._head[me._i] class SequenceTemplate (BasicTemplate): + """ + A SequenceTemplate concatenates a number of other templates. + """ + def __new__(cls, seq, **kw): + """ + Construct a template from a sequence SEQ of other templates. + + If SEQ is a singleton (which it often is) then return it directly; + otherwise construct a SequenceTemplate. + """ if len(seq) == 1: return seq[0] else: - me = super(SequenceTemplate, cls).__new__(cls, seq = seq, **kw) - tt = [] - cls = type(me) - for t in seq: - if isinstance(t, cls): tt += t._seq - else: tt.append(t) - me._seq = tt - return me + return super(SequenceTemplate, cls).__new__(cls, **kw) + def __init__(me, seq, **kw): + """ + Initialize a new SequenceTemplate object from SEQ. + + The sequence is flattened out: if SEQ contains SequenceTemplates then we + use their children directly, so that we don't have a useless tree. + """ super(SequenceTemplate, me).__init__(**kw) + tt = [] + cls = type(me) + for t in seq: + if isinstance(t, cls): tt += t._seq + else: tt.append(t) + me._seq = tt + def relations(me): + """ + The relations of a SequenceTemplate are the union of the relations of its + children. + """ rr = set() for t in me._seq: rr.update(t.relations()) return rr + def subst(me, out, cs): + """ + The output of a SequenceTemplate is the concatenation of the expansions + of its children. + """ for t in me._seq: t.subst(out, cs) + def __repr__(me): return '#' % me._seq class RepeatTemplate (BasicTemplate): + """ + A RepeatTemplate iterates its body over a number of relations. + """ + def __init__(me, sub): + """ + Initialize a new RepeatTemplate, given a template to act as its body. + """ me._sub = sub + def relations(me): + """ + A RepeatTemplate hides the relations of its body. 
+ """ return set() + def subst(me, out, cs): + """ + Substitute a RepeatTemplate, by iterating over the relations mentioned in + its body template. + """ rr = me._sub.relations() for r in rr: if len(r) == 0: return @@ -205,6 +441,7 @@ class RepeatTemplate (BasicTemplate): me._sub.subst(out, cs) if not cs.step(): break cs.pop() + def __repr__(me): return '#' % me._sub @@ -212,110 +449,249 @@ class RepeatTemplate (BasicTemplate): ### Some slightly cheesy parsing machinery. class ParseState (object): + """ + A ParseState object keeps track of a parser's position in a file. + + The `curr' slot contains the current line under consideration. + """ + def __init__(me, file, text): + """ + Initialize a ParseState object. + + The FILE is a string naming the source file, and the TEXT is a string + holding the file's contents, which is split into lines here. + """ me._file = file me._i = 0 me._it = iter(text.splitlines(True)) me.step() + def step(me): - try: me.curr = me._it.next() + """ + Advance the ParseState to the next line. + + Sets `curr' to the next line, or to None if the input is exhausted. + """ + try: me.curr = next(me._it) except StopIteration: me.curr = None else: me._i += 1 + def error(me, msg): + """ + Report a fatal error during parsing, attributing it to the current line. + """ die('%s:%d: %s' % (me._file, me._i, msg)) class token (object): + """ + A token object has no interesting properties other than its identity. + """ + def __init__(me, name): + """Initialize a new token, with the given NAME.""" me._name = name def __repr__(me): + """Return a description of the token, for diagnostic purposes.""" return '#<%s>' % me._name +## Some magical tokens useful during parsing. EOF = token('eof') END = token('end') +## Regular expressions matching substitution tags. R_SIMPLETAG = RX.compile(r'@ (\w+)', RX.VERBOSE) R_COMPLEXTAG = RX.compile(r'@ { (\w+) ((?: : \w+)*) }', RX.VERBOSE) +## A dictionary mapping operation names to functions which implement them. 
OPMAP = {} def defop(func): - name = func.func_name + """ + Decorator for substitution operator functions. + + Remember the operator in `OPMAP'; the operator's name is taken from FUNC's + name, removing a prefix `op_' if there is one. + + An operator function is given the raw value as an argument and should + return the transformed value. + """ + name = func_name(func) if name.startswith('op_'): name = name[3:] OPMAP[name] = func return func @defop -def op_u(val): return val.upper() +def op_u(val): + """@{COLUMN:u} -- the item in upper case.""" + return val.upper() + +@defop +def op_l(val): + """@{COLUMN:l} -- the item in lower case.""" + return val.lower() @defop -def op_l(val): return val.lower() +def op_f(val): + """@{COLUMN:f} -- the item, with `/' characters replaced by `-'.""" + return val.replace('/', '-') R_NOTIDENT = RX.compile(r'[^a-zA-Z0-9_]+') @defop -def op_c(val): return R_NOTIDENT.sub('_', val) +def op_c(val): + """ + @{COLUMN:c} -- the item, with non-alphanumeric sequences replaced with `_'. + """ + return R_NOTIDENT.sub('_', val) def _pairify(val): + """ + Split VAL into two, at an `=' sign. + + If VAL has the form `THIS=THAT' then return the pair (THIS, THAT); + otherwise return (VAL, VAL). + """ c = val.find('=') if c >= 0: return val[:c], val[c + 1:] else: return val, val @defop -def op_left(val): return _pairify(val)[0] +def op_left(val): + """@{COLUMN:left} -- the left-hand side of the item.""" + return _pairify(val)[0] @defop -def op_right(val): return _pairify(val)[1] +def op_right(val): + """@{COLUMN:right} -- the right-hand side of the item.""" + return _pairify(val)[1] def parse_text(ps): + """ + Parse a chunk of text from a ParseState. + + Stop when we get to something which looks like a template keyword, but + extract tags. Return the resulting template. + + Tags have the form `@COLUMN', or `@{COLUMN:OPERATOR:...}'. 
The text may + contain comments beginning `%#', which are ignored, and lines beginning + `%%' which have the initial `%' removed and are otherwise treated as normal + text (and, in particular, may contain tags). Other lines beginning with + `%' are directives and must be processed by our caller. + """ + + ## Starting out: no templates collected, and an empty buffer of literal + ## text. tt = [] lit = StringIO() + def spill(): + ## Spill accumulated literal text from `lit' into a LiteralTemplate + ## object. l = lit.getvalue() if l: tt.append(LiteralTemplate(l)) - lit.reset() + lit.seek(0) lit.truncate() + + ## Iterate over the lines of input. while True: line = ps.curr + + ## Stop if there's no more text; handle lines beginning with `%'. if line is None: break elif line.startswith('%'): if line.startswith('%#'): ps.step(); continue elif line.startswith('%%'): line = line[1:] else: break + + ## Work through the line, finding tags. i = 0 while True: + + ## If there are no more `@' signs, there can be no more tags, and we're + ## done. j = line.find('@', i) if j < 0: break + + ## Write the chunk we've found. lit.write(line[i:j]) + + ## If the next character is also `@' then this is an escape and we + ## should carry on. + if line[j:].startswith('@@'): + lit.write('@') + i = j + 2 + continue + + ## Parse the tag into a column name, and maybe some operators. m = R_SIMPLETAG.match(line, j) if not m: m = R_COMPLEXTAG.match(line, j) if not m: ps.error('invalid tag') col = m.group(1) try: rel, i = COLMAP[col] except KeyError: ps.error("unknown column `%s'" % col) - wholeop = None ops = m.lastindex >= 2 and m.group(2) + + ## If we have operators then look them up and compose them. 
+ wholeop = None if ops: for opname in ops[1:].split(':'): try: op = OPMAP[opname] except KeyError: ps.error("unknown operation `%s'" % opname) if wholeop is None: wholeop = op else: wholeop = (lambda f, g: lambda x: f(g(x)))(op, wholeop) + + ## Emit a LiteralTemplate for the accumulated text, and a TagTemplate + ## for the tag. spill() tt.append(TagTemplate(rel, i, wholeop)) + + ## Continue from after the tag. i = m.end() + + ## Finished a line. Write out the remainder of the line and move onto + ## the next. lit.write(line[i:]) ps.step() + + ## Run out of things to do. Flush out the rest of the literal text and + ## combine the templates. spill() return SequenceTemplate(tt) +## A list of (regular expression, directive-processing function) pairs. DIRECT = [] def direct(rx): + """ + Function decorator for template file directives. + + Associate the regular expression RX with the function in `DIRECT'. + Directive functions are invoked as FUNC(PS, M), where PS is the ParseState, + and M is the match object resulting from matching RX against the directive + text. + """ def _(func): DIRECT.append((RX.compile(rx, RX.VERBOSE), func)) return func return _ def parse_template(ps): + """ + Parse a single template from the ParseState PS. + + A single template is either a chunk of text (parsed by `parse_text') or a + directive (handled by the appropriate function in `DIRECT'). + + Returns either a template object, or a special token. In particular, `EOF' + is returned if we run out of text; directives may return other tokens. + """ + + ## Skip initial comments. Otherwise we might end up with an empty + ## SequenceTemplate here. while ps.curr is not None and ps.curr.startswith('%#'): ps.step() + + ## If we've run out of input, return `EOF' here. A line beginning `%%', or + ## not beginning `%', means we've found a chunk of text. Otherwise find + ## the right directive handler. 
if ps.curr is None: return EOF elif ps.curr.startswith('%'): if ps.curr.startswith('%%'): return parse_text(ps) @@ -330,6 +706,16 @@ def parse_template(ps): return parse_text(ps) def parse_templseq(ps, nestp): + """ + Parse a sequence of templates from the ParseState PS. + + Calls `parse_template' repeatedly. If NESTP is true, then an `END' token + (presumably from a directive handler) is permitted and halts parsing; + otherwise `END' signifies an error. + + Returns a template object. + """ + tt = [] while True: t = parse_template(ps) @@ -344,13 +730,25 @@ def parse_templseq(ps, nestp): @direct(r'repeat') def dir_repeat(ps, m): + """ + %repeat + BODY + %end + + Iterate the body over the cartesian product of the relations mentioned + within. + """ return RepeatTemplate(parse_templseq(ps, True)) @direct(r'end') def dir_end(ps, m): + """%end -- an end marker used to delimit chunks of template.""" return END def compile_template(file, text): + """ + Compile TEXT into a template, attributing errors to FILE. + """ ps = ParseState(file, text) t = parse_templseq(ps, False) return t @@ -360,17 +758,21 @@ def compile_template(file, text): op = OP.OptionParser( description = 'Generates files by filling in simple templates', - usage = 'usage: %prog [-gl] FILE [COL,...=VAL,... ... | @FILE:COL,...] ...', + usage = 'usage: %prog {-l | -g TMPL} FILE [COL,...=VAL,... ... | @FILE:COL,...] 
...', version = 'Catacomb version @VERSION@') +def cb_gen(opt, optstr, arg, op): + op.values.input = arg + op.values.mode = 'gen' for short, long, kw in [ ('-l', '--list', dict( action = 'store_const', const = 'list', dest = 'mode', help = 'list filenames generated')), ('-g', '--generate', dict( - action = 'store', metavar = 'PATH', dest = 'input', - help = 'generate output (default)'))]: + action = 'callback', metavar = 'TEMPLATE', + callback = cb_gen, type = 'string', + help = 'generate file(s) from TEMPLATE file'))]: op.add_option(short, long, **kw) -op.set_defaults(mode = 'gen') +op.set_defaults(mode = 'what?') opts, args = op.parse_args() if len(args) < 1: op.error('missing FILE') @@ -379,6 +781,9 @@ for rel in args[1:]: read_thing(rel) filetempl = compile_template('', filepat) def filenames(filetempl): + """ + Generate the filenames in the compiled filename template FILETEMPL. + """ cs = CursorSet() rr = filetempl.relations() for r in rr: @@ -391,8 +796,9 @@ def filenames(filetempl): if not cs.step(): break cs.pop() +## Main dispatch. if opts.mode == 'list': - for file, cs in filenames(filetempl): print file + for file, cs in filenames(filetempl): print(file) elif opts.mode == 'gen': with open(opts.input) as f: templ = RepeatTemplate(compile_template(opts.input, f.read())) @@ -402,6 +808,6 @@ elif opts.mode == 'gen': templ.subst(out, cs) OS.rename(new, file) else: - raise Exception, 'What am I doing here?' + die('What am I doing here?') ###----- That's all, folks --------------------------------------------------