From: Mark Wooding Date: Wed, 14 May 2014 20:35:40 +0000 (+0100) Subject: math/mpgen, symm/multigen: Add copious commentary. X-Git-Tag: 2.1.7~6 X-Git-Url: https://git.distorted.org.uk/~mdw/catacomb/commitdiff_plain/e7abc7ea1c83b671db65884d7f8e4b8c57762083 math/mpgen, symm/multigen: Add copious commentary. --- diff --git a/math/mpgen b/math/mpgen index de03c3e6..7d11bfda 100644 --- a/math/mpgen +++ b/math/mpgen @@ -36,6 +36,12 @@ from sys import stdout ### Random utilities. def write_header(mode, name): + """ + Write a C-language file header. + + The header comment identifies the processing MODE, and the NAME of the + output file. + """ stdout.write("""\ /* -*-c-*- GENERATED by mpgen (%s) * @@ -45,19 +51,32 @@ def write_header(mode, name): """ % (mode, name)) def write_banner(text): + """Write a separator banner to the output, with header TEXT.""" stdout.write("/*----- %s %s*/\n" % (text, '-' * (66 - len(text)))) -class struct (object): pass +class struct (object): + """ + A struct object exists so that you can set attributes on it. + """ + pass R_IDBAD = RX.compile('[^0-9A-Za-z]') -def fix_name(name): return R_IDBAD.sub('_', name) +def fix_name(name): + """Replace non-alphanumeric characters in NAME with underscores.""" + return R_IDBAD.sub('_', name) ###-------------------------------------------------------------------------- ### Determining the appropriate types. +## A dictionary mapping type tags to subclasses of BasicIntType. TYPEMAP = {} class IntClass (type): + """ + The IntClass is a metaclass for integer-type classes. + + It associates the type class with its tag in the `TYPEMAP' dictionary. + """ def __new__(cls, name, supers, dict): c = type.__new__(cls, name, supers, dict) try: TYPEMAP[c.tag] = c @@ -65,6 +84,36 @@ class IntClass (type): return c class BasicIntType (object): + """ + A base class for integer-type classes, providing defaults and protocol. + + Integer-type classes have the following attributes and methods. + + preamble Some code to be emitted to a header file to make use + of the type. Defaults to the empty string. + + typedef_prefix A prefix to be written before the type's name in a + `typedef' declaration, possibly to muffle warnings. + Defaults to the empty string. + + literalfmt A Python `%' format string for generating literal + values of the type; used by the default `literal' + method (so if you override it then you don't need to + set this). Defaults to `%su'. + + literal(VALUE, [FMT]) Emit a literal value of the type, encoding VALUE. + The default FMT has the form `0x%0Nx', so as to emit + in hex with the appropriate number of leading zeros. + + Instances also carry additional attributes. + + bits The width of the integer type, in bits. + + rank An integer giving the conversion rank of the type. + Higher values generally denote wider types. + + litwd The width of a literal of the type, in characters. + """ __metaclass__ = IntClass preamble = '' typedef_prefix = '' @@ -113,7 +162,36 @@ class UIntMaxType (BasicIntType): preamble = "\n#include \n" class TypeChoice (object): + """ + A TypeChoice object selects C integer types for multiprecision integers. + + It exports its decisions as attributes: + + mpwbits The width of an `mpw', in bits. + + mpw The integer type for single-precision values. + + mpd The integer type for double-precision values. + + ti An object bearing raw information about the available + integer types, as follows... + + TYPEINFO A list of items (TAG, BITS) describing the widths of + the available types suitable for multiprecision use, + in ascending order of rank. + + LIMITS A list of items (TAG, LO, HI) describing the ranges + of integer types, for constructing the `mplimits' + files. + """ + def __init__(me, tifile): + """ + Read the definitions from TIFILE, and select types appropriately. + + The TIFILE is a tiny fragment of Python code which should set `TYPEINFO' + and `LIMITS' in its global namespace. + """ ## Load the captured type information. me.ti = TY.ModuleType('typeinfo') @@ -165,9 +243,25 @@ class TypeChoice (object): ###-------------------------------------------------------------------------- ### Outputting constant multiprecision integers. +## The margin for outputting MP literals. MARGIN = 72 def write_preamble(): + """ + Write a preamble for files containing multiprecision literals. + + We define a number of macros for use by the rest of the code: + + ZERO_MP An `mp' initializer denoting the value zero. + + POS_MP(NAME) Constructs an `mp' initializer denoting a positive + integer whose limbs were emitted with the given + NAME. + + NEG_MP(NAME) Constructs an `mp' initializer denoting a negative + integer whose limbs were emitted with the given + NAME. + """ stdout.write(""" #include #define MP_(name, flags) \\ @@ -180,13 +274,24 @@ def write_preamble(): """) def write_limbs(name, x): + """ + Write the limbs of the value X, with the given NAME. + """ + + ## We don't need to do anything special for zero. if not x: return + + ## Start on the limb vector. No delimiter needed, and we shall need to + ## start a new line before any actual output. We want to write the + ## absolute value here, because we use a signed-magnitude representation. stdout.write("\nstatic const mpw %s__mpw[] = {" % name) sep = '' pos = MARGIN if x < 0: x = -x mask = (1 << TC.mpwbits) - 1 + ## We work from the little-end up, picking off `mpwbits' at a time. Start + ## a new line if we can't fit the value on the current one. while x > 0: w, x = x & mask, x >> TC.mpwbits f = TC.mpw.literal(w) @@ -198,17 +303,30 @@ def write_limbs(name, x): pos += len(f) + 2 sep = ',' + ## We're done. Finish off the initializer. stdout.write("\n};\n") def mp_body(name, x): + """ + Write the body of an `mp' object, for the value NAME. + """ return "%s_MP(%s)" % (x >= 0 and "POS" or "NEG", name) ###-------------------------------------------------------------------------- ### Mode definition machinery. +## A dictionary mapping mode names to their handler functions. MODEMAP = {} def defmode(func): + """ + Function decorator: associate the function with the name of a mode. + + The mode name is taken from the function's name: a leading `m_' is stripped + off, if there is one. Mode functions are invoked with the positional + arguments from the command and are expected to write their output to + stdout. + """ name = func.func_name if name.startswith('m_'): name = name[2:] MODEMAP[name] = func @@ -219,16 +337,39 @@ def defmode(func): @defmode def m_mptypes(): + """ + Write the `mptypes.h' header. + + This establishes the following types. + + mpw An integer type for single-precision values. + + mpd An integer type for double-precision values. + + And, for t being each of `w' or `d', the following constants: + + MPt_BITS The width of the type, in bits. + + MPt_P2 The smallest integer k such that 2^k is not less than + MPt_BITS. (This is used for binary searches.) + + MPt_MAX The largest value which may be stored in an object of + the type. + """ + + ## Write the preamble. write_header("mptypes", "mptypes.h") stdout.write("""\ #ifndef CATACOMB_MPTYPES_H #define CATACOMB_MPTYPES_H """) + ## Write any additional premable for the types we've selected. have = set([TC.mpw, TC.mpd]) for t in have: stdout.write(t.preamble) + ## Emit the types and constants. for label, t, bits in [('mpw', TC.mpw, TC.mpwbits), ('mpd', TC.mpd, TC.mpwbits*2)]: LABEL = label.upper() @@ -240,6 +381,7 @@ def m_mptypes(): stdout.write("#define %s_MAX %s\n" % (LABEL, t.literal((1 << bits) - 1, "%d"))) + ## Done. stdout.write("\n#endif\n") ###-------------------------------------------------------------------------- @@ -247,9 +389,21 @@ def m_mptypes(): @defmode def m_mplimits_c(): + """ + Write the `mplimits.c' source file. + + This contains `mp' constants corresponding to the various integer types' + upper and lower bounds. The output is a vector `mp_limits' consisting of + the distinct nonzero bounds, in order of their first occurrence in the + `ti.LIMITS' list. + """ + + ## Write the preamble. write_header("mplimits_c", "mplimits.c") stdout.write('#include "mplimits.h"\n') write_preamble() + + ## Write out limbs for limits as we come across them. seen = {} v = [] def write(x): @@ -261,6 +415,7 @@ def m_mplimits_c(): write(lo) write(hi) + ## Write the main vector. stdout.write("\nmp mp_limits[] = {") i = 0 sep = "\n " @@ -272,6 +427,14 @@ def m_mplimits_c(): @defmode def m_mplimits_h(): + """ + Write the `mplimits.h' source file. + + For each type TAG, this defines constants MP_TAG_MIN and MP_TAG_MAX + representing the lower and upper bounds of the type. + """ + + ## Write the preamble. write_header("mplimits_h", "mplimits.h") stdout.write("""\ #ifndef CATACOMB_MPLIMITS_H @@ -285,6 +448,8 @@ extern mp mp_limits[]; """) + ## Now define constants for the bounds. Things which are zero can go to + ## our existing `MP_ZERO'; otherwise we index the `mp_limits' vector. seen = { 0: "MP_ZERO" } slot = [0] def find(x): @@ -298,12 +463,19 @@ extern mp mp_limits[]; stdout.write("#define MP_%s_MIN %s\n" % (tag, find(lo))) stdout.write("#define MP_%s_MAX %s\n" % (tag, find(hi))) + ## All done. stdout.write("\n#endif\n") ###-------------------------------------------------------------------------- ### Group tables. class GroupTableClass (type): + """ + Metaclass for group tables, which registers them in the `MODEMAP'. + + Such a class must define an attribute `mode' giving the mode name, and a + class method `run' which writes the necessary output. + """ def __new__(cls, name, supers, dict): c = type.__new__(cls, name, supers, dict) try: mode = c.mode @@ -312,10 +484,101 @@ class GroupTableClass (type): return c class GroupTable (object): + """ + Base class for group tables objects. + + A `group table' is a table of constants, typically defining a cyclic group + or something similar. We read the values from an input file, and write + them out as C definitions. These have a somewhat stereotyped format, so we + can mostly handle them uniformly. + + Specifically, input files consist of lines which are split into + whitespace-separated words. Blank lines, and lines beginning with `#', are + ignored. The remaining lines are gathered together into stanzas of the + form + + KEYWORD NAME [HEAD-VALUE ...] + SLOT VALUE + ... + + (Indentation is shown for clarity only.) Such a stanza describes a group + NAME; some slots are assigned values from the headline, and others from + their own individual lines. + + Subclasses must define the following attributes. + + data_t The name of the type for describing a particular + group. + + entry_t The name of the type which associates a name with + some group data; this will be defined as + + typedef struct ENTRY_T { + const char *name; + DATA_T *data; + } ENTRY_T; + + or similar. + + filename The filename, typically `SOMETHING.c', to put in the + output header. + + header The header file to include, so as to establish the + necessary types and other definitions. + + keyword The keyword beginning a new definition in the input + file. The default is `group'. + + mode The mode name, used to invoke this kind of table + operation (used by GroupTableClass). + + slots A vector of slot objects (see BaseSlot for the + protocol) describing the structure of this particular + kind of group, in the order they should be written in + an initializer. + + Instances carry an `st' attribute, which contains a `struct' object in + which slots can maintain some state. This object carries the following + attributes maintained by this class. + + d A dictionary mapping slots (not their names!) to + values assigned in the current stanza. This is reset + at the start of each stanza. Slot implementations + a free to use this or not, and the representation is + internal to the specific slot class. + + mpmap A dictionary mapping values (integers, or `None') to + C initializers (typically, actually, macro + invocations). + + name The name of the group currently being parsed. + + nextmp Index for the next `mp' object to be written. + """ + + ## Additional attributes, for internal use: + ## + ## _defs A set of known names for groups. + ## + ## _headslots A list of slots filled in from the headline. + ## + ## _names A list of pairs (ALIAS, DATA) mapping alias names to + ## the actual group data. + ## + ## _slotmap A dictionary mapping slot names to their + ## descriptions. + __metaclass__ = GroupTableClass + + ## Default values. keyword = 'group' slots = [] + def __init__(me): + """ + Initialize a group table object. + """ + me.st = st = struct() st.nextmp = 0 st.mpmap = { None: 'NO_MP', 0: 'ZERO_MP' } @@ -325,10 +588,22 @@ class GroupTable (object): me._defs = set() me._slotmap = dict([(s.name, s) for s in me.slots]) me._headslots = [s for s in me.slots if s.headline] + def _flush(me): + """ + Write out the data for a group once we've detected the end of its stanza. + """ + + ## If there's no current stanza, then do nothing. if me.st.name is None: return + + ## Start emitting the object. stdout.write("/* --- %s --- */\n" % me.st.name) + + ## Get the various slots to compute themselves. for s in me.slots: s.setup(me.st) + + ## Write the initializer. stdout.write("\nstatic %s c_%s = {" % (me.data_t, fix_name(me.st.name))) sep = "\n " for s in me.slots: @@ -336,40 +611,78 @@ class GroupTable (object): s.write(me.st) sep = ",\n " stdout.write("\n};\n\n") + + ## Clear the state for the next stanza. me.st.d = {} me.st.name = None + @classmethod def run(cls, input): + """ + Main output for a group table. Reads the file INPUT. + """ + + ## Make an object for us to work on. me = cls() + + ## Write the output preamble. write_header(me.mode, me.filename) stdout.write('#include "%s"\n' % me.header) write_preamble() stdout.write("#define NO_MP { 0, 0, 0, 0, 0, 0 }\n\n") + + ## The main group data. This will contain a `data_t' object for each + ## group we read. We'll also build the name to data map as we go. write_banner("Group data") stdout.write('\n') with open(input) as file: for line in file: + + ## Parse the line into fields. ff = line.split() if not ff or ff[0].startswith('#'): continue + if ff[0] == 'alias': + ## An alias. Just remember this. if len(ff) != 3: raise Exception, "wrong number of alias arguments" me._flush() me._names.append((ff[1], ff[2])) + elif ff[0] == me.keyword: + ## A headline for a new group. + + ## Check the headline syntax. Headline slots may be set here, or + ## later by name. if len(ff) < 2 or len(ff) > 2 + len(me._headslots): raise Exception, "bad number of headline arguments" + + ## Flush out the previous stanza. me._flush() + + ## Remember the new stanza's name, and add it to the list. me.st.name = name = ff[1] me._defs.add(name) me._names.append((name, name)) + + ## Set headline slots from the remaining headline words. for f, s in zip(ff[2:], me._headslots): s.set(me.st, f) + elif ff[0] in me._slotmap: + ## A slot assignment. Get the slot to store a value. + if me.st.name is None: + raise Exception, "no group currently being defined" if len(ff) != 2: raise Exception, "bad number of values for slot `%s'" % ff[0] me._slotmap[ff[0]].set(me.st, ff[1]) + else: + ## Something incomprehensible. raise Exception, "unknown keyword `%s'" % ff[0] + + ## End of the input. Write out the final stanza. me._flush() + + ## Now for the name-to-data mapping. write_banner("Main table") stdout.write("\nconst %s %s[] = {\n" % (me.entry_t, me.tabname)) for a, n in me._names: @@ -377,46 +690,139 @@ class GroupTable (object): raise Exception, "alias `%s' refers to unknown group `%s'" % (a, n) stdout.write(' { "%s", &c_%s },\n' % (a, fix_name(n))) stdout.write(" { 0, 0 }\n};\n\n") + + ## We're done. write_banner("That's all, folks") class BaseSlot (object): + """ + Base class for slot types. + + The slot protocol works as follows. Throughout, ST is a state object as + maintained by a GroupTable. + + __init__(NAME, [HEADLINE], [OMITP], [ALLOWP], ...) + Initialize the slot. The NAME identifies the slot, + and the keyword used to set it in input files. If + HEADLINE is true then the slot can be set from the + stanza headline. OMITP and ALLOWP are optional + functions: if OMITP(ST) returns true then the slot + may be omitted; conversely, if ALLOWP(ST, VALUE) + returns false then the slot cannot be assigned the + given VALUE. Other arguments may be allowed by + specific slot types. + + set(ST, VALUE) Set the slot to the given VALUE, typically by setting + ST.d[me]. The default just stores the VALUE without + interpreting it. + + setup(ST) Prepare the slot for output. The default method just + checks that ST.d contains a mapping for the slot. + All of the stanza's slots are set up before starting + on the initializer for the group data, so slots can + use this opportunity to emit preparatory definitions. + + write(ST) Write an initializer for the slot to standard + output. There is no default. + + The following attributes are exported. + + headline A flag: can the slot be initialized from the stanza + headline? + + name The slot's name. + """ + def __init__(me, name, headline = False, omitp = None, allowp = None): + """ + Initialize a new slot object, setting the necessary attributes. + """ me.name = name me.headline = headline me._omitp = omitp me._allowp = allowp + def set(me, st, value): + """ + Store a VALUE for the slot. + """ if me._allowp and not me._allowp(st, value): raise Exception, "slot `%s' not allowed here" % me.name st.d[me] = value + def setup(me, st): + """ + Prepare the slot for output, checking its value and so on. + """ if me not in st.d and (not me._omitp or not me._omitp(st)): raise Exception, "missing slot `%s'" % me.name class EnumSlot (BaseSlot): + """ + An EnumSlot object represents a slot which can contain one of a number of + named values. + + An omitted value is written as a literal `0'. + """ + def __init__(me, name, prefix, values, **kw): + """ + Initialize an EnumSlot object. + + The VALUES are a set of value names. On output, a value is converted to + uppercase, and prefixed by the PREFIX and an underscore. + """ super(EnumSlot, me).__init__(name, **kw) me._values = set(values) me._prefix = prefix + def set(me, st, value): + """ + Check that the VALUE is one of the ones we know. + """ if value not in me._values: raise Exception, "invalid %s value `%s'" % (me.name, value) super(EnumSlot, me).set(st, value) + def write(me, st): + """ + Convert the slot value to the C constant name. + """ try: stdout.write('%s_%s' % (me._prefix, st.d[me].upper())) except KeyError: stdout.write('0') class MPSlot (BaseSlot): + """ + An MPSlot object represents a slot which can contain a multiprecision + integer. + + An omitted value is written as a invalid `mp' object. + """ + def set(me, st, value): + """ + Set a value; convert it to a Python integer. + """ super(MPSlot, me).set(st, long(value, 0)) + def setup(me, st): + """ + Prepare to write the slot. + + If this is a new integer, then write out a limb vector. Names for the + limbs are generated unimaginitively, using a counter. + """ super(MPSlot, me).setup(st) v = st.d.get(me) if v not in st.mpmap: write_limbs('v%d' % st.nextmp, v) st.mpmap[v] = mp_body('v%d' % st.nextmp, v) st.nextmp += 1 + def write(me, st): + """ + Write out an `mp' initializer for the slot. + """ stdout.write(st.mpmap[st.d.get(me)]) class BinaryGroupTable (GroupTable): @@ -473,11 +879,14 @@ for shortopt, longopt, kw in [ op.set_defaults(typeinfo = './typeinfo.py') opts, args = op.parse_args() +## Parse the positional arguments. if len(args) < 1: op.error('missing MODE') mode = args[0] +## Establish the choice of low-level C types. TC = TypeChoice(opts.typeinfo) +## Find the selected mode, and invoke the appropriate handler. try: modefunc = MODEMAP[mode] except KeyError: op.error("unknown mode `%s'" % mode) modefunc(*args[1:]) diff --git a/symm/multigen b/symm/multigen index 20024bc5..f498ce2a 100755 --- a/symm/multigen +++ b/symm/multigen @@ -36,45 +36,113 @@ from sys import argv, exit, stderr ###-------------------------------------------------------------------------- ### Utilities. -QUIS = OS.path.basename(argv[0]) +QUIS = OS.path.basename(argv[0]) # Program name, for use in errors. def die(msg): + """Report MSG as a fatal error, and exit.""" stderr.write('%s: %s\n' % (QUIS, msg)) exit(1) def indexed(seq): + """ + Generate pairs (I, X), where I counts from zero and X are the items of SEQ. + """ return IT.izip(IT.count(), seq) ###-------------------------------------------------------------------------- ### Reading the input values. +## Map column names to (Relation, # index) pairs. COLMAP = {} class Cursor (object): + """ + A Cursor object keeps track of an iteration through a Relation. + + At any time, the Cursor has a `current' row; the individual cells of this + row may be retrieved using Python's standard indexing operator. The `step' + method advances to the next row (if there is one). The `reset' method + returns to row zero. + """ + def __init__(me, rel): + """ + Initialize a new Cursor object, tracking its way through a Relation REL. + + The new Cursor has row zero as its current row. The REL must not be + empty. + """ me._rel = rel me.reset() + def step(me): + """ + Advance the Cursor to the next row. + + Returns False if there is no next row; otherwise True. + """ me._i += 1 if me._i >= len(me._rel): me._i = me._row = None return False me._row = me._rel[me._i] return True + def reset(me): + """ + Reset the Cursor, so that row zero is current again. + """ me._i = 0 me._row = me._rel[0] + def __getitem__(me, i): + """ + Return the item in column I of the Cursor's current row. + + The index must be acceptable to the underlying row object, but otherwise + the Cursor imposes no restrictions. Indices need not be numeric, for + example. + """ return me._row[i] + def __repr__(me): + """ + Return a text description of the Cursor, for diagnostic use. + """ return '#' % (me._rel, me._i, me._row) class CursorSet (object): + """ + A CursorSet iterates over the cartiesian product of a number of Relations. + + More precisely: it maintains a stack, each level of which tracks a number + of Relations. More Relations can be pushed onto this stack with the `push' + method, and removed with `pop'. The `step' method advances through the + cartesian product of the Relations in the top level of the stack -- the + `active' Relations. Columns from the current rows of all of the currently + known Relations -- whether active or not -- can be extracted using `get'. + """ + def __init__(me): + """ + Initialize a new CursorSet object. + + A new CursorSet has an empty stack. + """ me._map = {} me._stack = [] me._act = None + def push(me, rels): + """ + Push the new Relations RELS onto the stack and start iterating. + + The currently active Relations are pushed down. Those Relations which are + not already known to the CursorSet become the newly active collection. + (Relations which are already known are simply ignored.) + + Iteration traverses Relations on the right more rapidly. + """ cc = [] rr = [] for r in rels: @@ -84,7 +152,14 @@ class CursorSet (object): cc.append(c) me._stack.append((me._act, rr)) me._act = cc + def step(me): + """ + Advance the CursorSet through the currently active Relations. + + Return False if the active Relations have now been exhausted; otherwise + return True. + """ i = 0 while i < len(me._act): if me._act[i].step(): return True @@ -92,35 +167,90 @@ class CursorSet (object): me._act[i].reset() i += 1 return False + def pop(me): + """ + Pop the active Relations. + + Return to iterating over the previously active collection. + """ me._act, rels = me._stack.pop() for r in rels: del me._map[r] + def get(me, rel, i): + """ + Return the item with index I in the current row of Relation REL. + """ return me._map[rel][i] class Relation (object): + """ + A Relation keeps track of a table of data. + + A Relation consists of a `header', which is a sequence of string names, + and a rectangular array of data, each row of which has the same number of + items as the header. + + Relations can be iterated over using Cursors and CursorSets. + """ + def __init__(me, head): + """ + Initialize a new, empty Relation with header HEAD. + + The `COLMAP' dictionary is updated to map the names in the header to this + Relation and its column indices. + """ me._head = head me._rows = [] for i, c in indexed(head): COLMAP[c] = me, i + def addrow(me, row): + """ + Add a ROW to the Relation. + + The new row must have the correct number of entries. + """ if len(row) != len(me._head): die("mismatch: row `%s' doesn't match heading `%s'" % (', '.join(row), ', '.join(me._head))) me._rows.append(row) + def __len__(me): + """Return the number of rows in the Relation.""" return len(me._rows) + def __getitem__(me, i): + """Return the Ith row of the Relation.""" return me._rows[i] + def __repr__(me): + """Return a textual description of the Relation, for diagnostic use.""" return '#' % me._head def read_immediate(word): + """ + Return a Relation constructed by parsing WORD. + + The WORD has the form `HEAD=ROW ROW ...', where the HEAD and ROWs are + comma-separated lists of strings which will form the relation's header and + rows respectively. There is no way to include an item which contains a + comma or whitespace. + """ head, rels = word.split('=', 1) rel = Relation([c.strip() for c in head.split(',')]) for row in rels.split(): rel.addrow([c.strip() for c in row.split(',')]) def read_file(spec): + """ + Return a Relation constructed from a file, according to SPEC. + + The SPEC has the form `FILE:HEAD', where FILE names a file, and HEAD is a + comma-separated list of strings to form the relation's header. Each line + from the file which is neither empty nor begins with `#' is split into + whitespace-separated words to form a row in the relation. There is no way + to include an item which contains whitespace. + """ file, head = spec.split(':', 1) rel = Relation([c.strip() for c in head.split(',')]) with open(file) as f: @@ -130,6 +260,13 @@ def read_file(spec): rel.addrow(line.split()) def read_thing(spec): + """ + Return a relation constructed from SPEC. + + If SPEC begins with `@' then read the relation from a file (see + `read_file'); otherwise interpret it as immediate data (see + `read_immediate'). + """ if spec.startswith('@'): read_file(spec[1:]) else: read_immediate(spec) @@ -137,42 +274,104 @@ def read_thing(spec): ### Template structure. class BasicTemplate (object): + """ + Base class for template objects. + + The protocol for templates consists of two methods: + + relations() Return a set of Relations mentioned at top-level in + substitutions in the template. + + subst(OUT, CS) Fill in the template, writing the output to the + stream OUT. The CS is a CursorSet object tracking + the current iteration state. + """ pass class LiteralTemplate (BasicTemplate): + """ + A LiteralTemplate outputs a fixed string. + """ + def __init__(me, text, **kw): + """ + Initialize a new LiteralTemplate object. TEXT is the text to be written. + """ super(LiteralTemplate, me).__init__(**kw) me._text = text + def relations(me): + """A LiteralTemplate contains no substitutions.""" return set() + def subst(me, out, cs): + """A LiteralTemplate just emits its text.""" out.write(me._text) + def __repr__(me): return '#' % me._text class TagTemplate (BasicTemplate): + """ + A TagTemplate object expands a substitution tag. + + It extracts an item from the current row of a relation, processes it + according to an operation, and outputs the result. + """ + def __init__(me, rel, i, op, **kw): + """ + Initialize a new TagTemplate object. + + REL is the relation from which to pick the output; I is the column index; + OP is a transformation to apply to the data, and may be None to indicate + that the data should not be transformed. + """ super(TagTemplate, me).__init__(**kw) me._rel = rel me._i = i me._op = op + def relations(me): + """The TagTemplate knows which relation it uses.""" return set([me._rel]) + def subst(me, out, cs): + """ + A TagTemplate extracts and transforms an item from the current row of + a relation. + """ val = cs.get(me._rel, me._i) if me._op is not None: val = me._op(val) out.write(val) + def __repr__(me): return '#' % me._rel._head[me._i] class SequenceTemplate (BasicTemplate): + """ + A SequenceTemplate concatenates a number of other templates. + """ + def __new__(cls, seq, **kw): + """ + Construct a template from a sequence SEQ of other templates. + + If SEQ is a singleton (which it often is) then return it directly; + otherwise construct a SequenceTemplate. + """ if len(seq) == 1: return seq[0] else: return super(SequenceTemplate, cls).__new__(cls, seq = seq, **kw) def __init__(me, seq, **kw): + """ + Initialize a new SequenceTemplate object from SEQ. + + The sequence is flattened out: if SEQ contains SequenceTemplates then we + use their children directly, so that we don't have a useless tree. + """ super(SequenceTemplate, me).__init__(**kw) tt = [] cls = type(me) @@ -182,20 +381,46 @@ class SequenceTemplate (BasicTemplate): me._seq = tt def relations(me): + """ + The relations of a SequenceTemplate are the union of the relations of its + children. + """ rr = set() for t in me._seq: rr.update(t.relations()) return rr + def subst(me, out, cs): + """ + The output of a SequenceTemplate is the concatenation of the expansions + of its children. + """ for t in me._seq: t.subst(out, cs) + def __repr__(me): return '#' % me._seq class RepeatTemplate (BasicTemplate): + """ + A RepeatTemplate iterates its body over a number of relations. + """ + def __init__(me, sub): + """ + Initialize a new RepeatTemplate, given a template to act as its body. + """ me._sub = sub + def relations(me): + """ + A RepeatTemplate hides the relations of its body. + """ return set() + def subst(me, out, cs): + """ + Substitute a RepeatTemplate, by iterating over the relations mentioned in + its body template. + """ rr = me._sub.relations() for r in rr: if len(r) == 0: return @@ -204,6 +429,7 @@ class RepeatTemplate (BasicTemplate): me._sub.subst(out, cs) if not cs.step(): break cs.pop() + def __repr__(me): return '#' % me._sub @@ -211,78 +437,174 @@ class RepeatTemplate (BasicTemplate): ### Some slightly cheesy parsing machinery. class ParseState (object): + """ + A ParseState object keeps track of a parser's position in a file. + + The `curr' slot contains the current line under consideration. + """ + def __init__(me, file, text): + """ + Initialize a ParseState object. + + The FILE is a string naming the source file, and the TEXT is an iterator + over the file's lines. + """ me._file = file me._i = 0 me._it = iter(text.splitlines(True)) me.step() + def step(me): + """ + Advance the ParseState to the next line. + + Sets `curr' to the next line, or to None if the input is exhausted. + """ try: me.curr = me._it.next() except StopIteration: me.curr = None else: me._i += 1 + def error(me, msg): + """ + Report a fatal error during parsing, attributing it to the current line. + """ die('%s:%d: %s' % (me._file, me._i, msg)) class token (object): + """ + A token object has no interesting properties other than its identity. + """ + def __init__(me, name): + """Initialize a new token, with the given NAME.""" me._name = name def __repr__(me): + """Return a description of the token, for diagnostic purposes.""" return '#<%s>' % me._name +## Some magical tokens useful during parsing. EOF = token('eof') END = token('end') +## Regular expressions matching substitution tags. R_SIMPLETAG = RX.compile(r'@ (\w+)', RX.VERBOSE) R_COMPLEXTAG = RX.compile(r'@ { (\w+) ((?: : \w+)*) }', RX.VERBOSE) +## A dictionary mapping operation names to functions which implement them. OPMAP = {} def defop(func): + """ + Decorator for substitution operator functions. + + Remember the operator in `OPMAP'; the operator's name is taken from FUNC's + name, removing a prefix `op_' if there is one. + + An operator function is given the raw value as an argument and should + return the transformed value. + """ name = func.func_name if name.startswith('op_'): name = name[3:] OPMAP[name] = func return func @defop -def op_u(val): return val.upper() +def op_u(val): + """@{COLUMN:u} -- the item in upper case.""" + return val.upper() @defop -def op_l(val): return val.lower() +def op_l(val): + """@{COLUMN:l} -- the item in upper case.""" + return val.lower() R_NOTIDENT = RX.compile(r'[^a-zA-Z0-9_]+') @defop -def op_c(val): return R_NOTIDENT.sub('_', val) +def op_c(val): + """ + @{COLUMN:c} -- the item, with non-alphanumeric sequences replaced with `_'. + """ + return R_NOTIDENT.sub('_', val) def _pairify(val): + """ + Split VAL into two, at an `=' sign. + + If VAL has the form `THIS=THAT' then return the pair (THIS, THAT); + otherwise return (VAL, VAL). + """ c = val.find('=') if c >= 0: return val[:c], val[c + 1:] else: return val, val @defop -def op_left(val): return _pairify(val)[0] +def op_left(val): + """@{COLUMN:left} -- the left-hand side of the item.""" + return _pairify(val)[0] @defop -def op_right(val): return _pairify(val)[1] +def op_right(val): + """@{COLUMN:right} -- the left-hand side of the item.""" + return _pairify(val)[1] def parse_text(ps): + """ + Parse a chunk of text from a ParseState. + + Stop when we get to something which looks like a template keyword, but + extract tags. Return the resulting template. + + Tags have the form `@COLUMN', or `@{COLUMN:OPERATOR:...}'. The text may + contain comments beginning `%#', which are ignored, and lines beginning + `%%' which have the initial `%' removed and are otherwise treated as normal + text (and, in particular, may contain tags). Other lines beginning with + `%' are directives and must be processed by our caller. + """ + + ## Starting out: no templates collected, and an empty buffer of literal + ## text. tt = [] lit = StringIO() + def spill(): + ## Spill accumulated literal text from `lit' into a LiteralTemplate + ## object. l = lit.getvalue() if l: tt.append(LiteralTemplate(l)) lit.reset() lit.truncate() + + ## Iterate over the lines of input. while True: line = ps.curr + + ## Stop if there's no more text; handle lines beginning with `%'. if line is None: break elif line.startswith('%'): if line.startswith('%#'): ps.step(); continue elif line.startswith('%%'): line = line[1:] else: break + + ## Work through the line, finding tags. i = 0 while True: + + ## If there are no more `@' signs, there can be no more tags, and we're + ## done. j = line.find('@', i) if j < 0: break + + ## Write the chunk we've found. lit.write(line[i:j]) + + ## If the next character is also `@' then this is an escape and we + ## should carry on. + if line[j:].startswith('@@'): + lit.write('@') + i = j + 2 + continue + + ## Parse the tag into a column name, and maybe some operators. m = R_SIMPLETAG.match(line, j) if not m: m = R_COMPLEXTAG.match(line, j) if not m: ps.error('invalid tag') @@ -290,6 +612,8 @@ def parse_text(ps): try: rel, i = COLMAP[col] except KeyError: ps.error("unknown column `%s'" % col) ops = m.lastindex >= 2 and m.group(2) + + ## If we have operators then look them up and compose them. wholeop = None if ops: for opname in ops[1:].split(':'): @@ -297,24 +621,60 @@ def parse_text(ps): except KeyError: ps.error("unknown operation `%s'" % opname) if wholeop is None: wholeop = op else: wholeop = (lambda f, g: lambda x: f(g(x)))(op, wholeop) + + ## Emit a LiteralTemplate for the accumulated text, and a TagTemplate + ## for the tag. spill() tt.append(TagTemplate(rel, i, wholeop)) + + ## Continue from after the tag. i = m.end() + + ## Finished a line. Write out the remainder of the line and move onto + ## the next. lit.write(line[i:]) ps.step() + + ## Run out of things to do. Flush out the rest of the literal text and + ## combine the templates. spill() return SequenceTemplate(tt) +## A dictionary mapping regular expressions to directive-processing functions. DIRECT = [] def direct(rx): + """ + Function decorator for template file directives. + + Associate the regular expression RX with the function in `DIRECT'. + Directive functions are invoked as FUNC(PS, M), where PS is the ParseState, + and M is the match object resulting from matching RX against the directive + text. + """ def _(func): DIRECT.append((RX.compile(rx, RX.VERBOSE), func)) return func return _ def parse_template(ps): + """ + Parse a single template from the ParseState PS. + + A single template is either a chunk of text (parsed by `parse_text') or a + directive (handled by the appropriate function in `DIRECT'). + + Returns either a template object, or a special token. In particular, `EOF' + is returned if we run out of text; directives may return other tokens. + """ + + ## Skip initial comments. Otherwise we might end up with an empty + ## SequenceTemplate here. while ps.curr is not None and ps.curr.startswith('%#'): ps.step() + + ## If we've run out of input, return `EOF' here. A line beginning `%%', or + ## not beginning `%', means we've found a chunk of text. Otherwise find + ## the right directive handler. if ps.curr is None: return EOF elif ps.curr.startswith('%'): if ps.curr.startswith('%%'): return parse_text(ps) @@ -329,6 +689,16 @@ def parse_template(ps): return parse_text(ps) def parse_templseq(ps, nestp): + """ + Parse a sequence of templates from the ParseState PS. + + Calls `parse_template' repeatedly If NESTP is true, then an `END' token + (presumably from a directive handler) is permitted and halts parsing; + otherwise `END' signifies an error. + + Returns a template object. + """ + tt = [] while True: t = parse_template(ps) @@ -343,13 +713,25 @@ def parse_templseq(ps, nestp): @direct(r'repeat') def dir_repeat(ps, m): + """ + %repeat + BODY + %end + + Iterate the body over the cartesian product of the relations mentioned + within. + """ return RepeatTemplate(parse_templseq(ps, True)) @direct(r'end') def dir_end(ps, m): + """%end -- an end marker used to delimet chunks of template.""" return END def compile_template(file, text): + """ + Compile TEXT into a template, attributing errors to FILE. + """ ps = ParseState(file, text) t = parse_templseq(ps, False) return t @@ -378,6 +760,9 @@ for rel in args[1:]: read_thing(rel) filetempl = compile_template('', filepat) def filenames(filetempl): + """ + Generate the filenames in the compiled filename template FILETEMPL. + """ cs = CursorSet() rr = filetempl.relations() for r in rr: @@ -390,6 +775,7 @@ def filenames(filetempl): if not cs.step(): break cs.pop() +## Main dispatch. if opts.mode == 'list': for file, cs in filenames(filetempl): print file elif opts.mode == 'gen':