#! /usr/bin/perl
###
### Construct an APT `sources.list' file
###
### (c) 2012 Mark Wooding
###

###----- Licensing notice ---------------------------------------------------
###
### This program is free software; you can redistribute it and/or modify
### it under the terms of the GNU General Public License as published by
### the Free Software Foundation; either version 2 of the License, or
### (at your option) any later version.
###
### This program is distributed in the hope that it will be useful,
### but WITHOUT ANY WARRANTY; without even the implied warranty of
### MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
### GNU General Public License for more details.
###
### You should have received a copy of the GNU General Public License
### along with this program; if not, write to the Free Software Foundation,
### Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.

use Data::Dumper;
use File::FnMatch qw(:fnmatch);
use Getopt::Long qw(:config gnu_compat bundling no_ignore_case);
use Text::ParseWords;

###--------------------------------------------------------------------------
### Miscellaneous utilities.

(our $QUIS = $0) =~ s:.*/::;
our $VERSION = "1.0.0";

sub fail ($) {
  my ($msg) = @_;
  ## Report a fatal error MSG and exit.

  print STDERR "$QUIS: $msg\n";
  exit 1;
}

sub literalp ($) {
  my ($s) = @_;
  ## Answer whether the string S is free of metacharacters.

  return $s !~ /[\\?*[]/;
}

###--------------------------------------------------------------------------
### Configuration sets.

sub cset_new () {
  ## Return a new configuration set.

  return { "%meta" => { allstarp => 1, ixlist => [], ixmap => {} } };
}

sub cset_indices ($$) {
  my ($cset, $what) = @_;
  ## Return the list of literal indices in the configuration set CSET.  If an
  ## explicit `indices' tag is defined, then use its value (split at
  ## whitespace).  If there are explicit literal indices, return them (in the
  ## correct order).  If all indices are `*', return a single `default' item.
  ## Otherwise report an error.

  if (defined (my $it = $cset->{indices}{"*"})) {
    return shellwords $it;
  } else {
    my $meta = $cset->{"%meta"};
    $meta->{allstarp} and return "default";
    return @{$meta->{ixlist}} if @{$meta->{ixlist}};
    fail "no literal indices for `$what'";
  }
}

sub cset_store ($$$$) {
  my ($cset, $tag, $ix, $value) = @_;
  ## Store VALUE in the configuration set CSET as the value for TAG with
  ## index pattern IX.

  my $meta = $cset->{"%meta"};
  $ix eq "*" or $meta->{allstarp} = 0;
  if (!$meta->{ixmap}{$ix} && literalp $ix) {
    $meta->{ixmap}{$ix} = 1;
    push @{$meta->{ixlist}}, $ix;
  }
  $cset->{$tag}{$ix} = $value;
}

sub cset_expand (\@$$);
%PAINT = ();

sub cset_lookup (\@$$;$) {
  my ($cset, $tag, $ix, $mustp) = @_;
  ## Look up TAG in the CSETs using index IX.  Return the value corresponding
  ## to the most specific match to IX in the earliest configuration set in
  ## the list.  If no set contains a matching value at all, then the
  ## behaviour depends on MUSTP: if true, report an error; if false, return
  ## undef.

  if ($PAINT{$tag}) { fail "recursive expansion of `\%${tag}[$ix]'"; }
  my $val = undef;
  CSET: for my $cs (@$cset) {
    defined (my $tset = $cs->{$tag}) or next CSET;
    if (defined (my $it = $tset->{$ix})) { $val = $it; last CSET; };
    my $pat = undef;
    PAT: while (my ($p, $v) = each %$tset) {
      fnmatch $p, $ix or next PAT;
      unless (defined($pat) && fnmatch($p, $pat)) { $val = $v; $pat = $p; }
    }
    last CSET if defined $val;
  }
  if (defined $val) {
    local $PAINT{$tag} = 1;
    my $exp = cset_expand @$cset, $ix, $val;
    return $exp;
  } elsif ($mustp) { fail "variable `$tag\[$ix]' undefined"; }
  else { return undef; }
}

sub cset_fetch (\%\@$$@) {
  my ($a, $cset, $mustp, $ix, @tag) = @_;
  ## Populate the hash A with values retrieved from the CSETs.  Each TAG is
  ## looked up with index IX, and if a value is found, it is stored in A with
  ## key TAG.  If MUSTP is true, then an error is reported unless a value is
  ## found for every TAG.

  for my $tag (@tag) {
    my $v = cset_lookup @$cset, $tag, $ix, $mustp;
    $a->{$tag} = $v if defined $v;
  }
}

sub cset_expand (\@$$) {
  my ($cset, $ix, $s) = @_;
  ## Expand placeholders %TAG or %{TAG} in the string S, relative to the
  ## CSETs and the index IX.

  $s =~ s{
    % (?:    (?P<NAME>\w+)
	| \{ (?P<NAME>\w+) \} )
  }{
    cset_lookup(@$cset, $+{NAME}, $ix, 1)
  }xeg;
  return $s;
}

###--------------------------------------------------------------------------
### Parsing.

our %DEFAULT = %{+cset_new};		# Default assignments.
our %CSET = ();				# Map of distro configuration sets.
our @SUB = ();				# List of subscriptions.

sub parse ($) {
  my ($fn) = @_;
  ## Parse the file named by FN and add definitions to the tables %DEFAULT,
  ## %CSET and @SUB.

  ## Open the file and prepare to read.
  open my $fh, "<", $fn or fail "open `$fn': $!";
  my $ln = 0;

  ## Report a syntax error, citing the offending file and line.
  my $syntax = sub { fail "$fn:$ln: $_[0]" };

  ## Report an error about an indented line with no stanza header.
  my $nomode = sub { $syntax->("missing stanza header") };
  my $mode = $nomode;

  ## Parse an assignment LINE and store it in CSET.
  my $assign = sub {
    my ($cset, $line) = @_;
    $line =~ m{
      ^ \s*
      (?P<TAG> \w+)
      (?: \[ (?P<IX> [^\]]+) \] )?
      \s* = \s*
      (?P<VALUE> | \S | \S.*\S)
      \s* $
    }x or $syntax->("invalid assignment");
    cset_store $cset, $+{TAG}, $+{IX} // "*", $+{VALUE};
  };

  ## Parse a subscription LINE and store it in @SUB.
  my $subscribe = sub {
    my ($line) = @_;
    my @w = shellwords $line;
    my @dist = ();
    while (my $w = shift @w) { last if $w eq ":"; push @dist, $w; }
    @w and @dist or $syntax->("empty distribution or release list");
    push @SUB, [\@dist, \@w];
  };

  for (;;) {

    ## Read a line.  If it's empty or a comment then ignore it.
    defined (my $line = readline $fh)
      or last;
    $ln++;
    next if $line =~ /^\s*($|\#)/;
    chomp $line;

    ## If the line begins with whitespace then process it according to the
    ## prevailing mode.
    if ($line =~ /^\s/) {
      $mode->($line);
      next;
    }

    ## Split the header line into tokens and determine an action.
    my @w = shellwords $line;
    $mode = $nomode;
    if ($w[0] eq "distribution") {
      @w == 2 or $syntax->("usage: distribution NAME");
      my $cset = $CSET{$w[1]} //= cset_new;
      $mode = sub { $assign->($cset, @_) };
    } elsif ($w[0] eq "default") {
      @w == 1 or $syntax->("usage: default");
      $mode = sub { $assign->(\%DEFAULT, @_) };
    } elsif ($w[0] eq "subscribe") {
      @w == 1 or $syntax->("usage: subscribe");
      $mode = $subscribe;
    } else {
      $syntax->("unknown toplevel directive `$w[0]'");
    }
  }

  ## Done.  Make sure we read everything.
  close $fh or die "read `$fn': $!";
}

###--------------------------------------------------------------------------
### Main program.

our $USAGE = "usage: $QUIS FILE|DIR ...";
sub version { print "$QUIS, version $VERSION\n"; }
sub help {
  print <<EOF;
$USAGE

Options:
  -h, --help		Show this help text.
  -v, --version		Show the program version number.
EOF
}

GetOptions('help|h|?'		 => sub { version; help; exit; },
	   'version|v'		 => sub { version; exit; })
  and @ARGV >= 1
  or do { print STDERR $USAGE, "\n"; exit 1; };

## Read the input files.
for my $fn (@ARGV) {
  if (-d $fn) {
    opendir my $dh, $fn or fail "opendir `$fn': $!";
    my @f = ();
    FILE: while (my $f = readdir $dh) {
      $f =~ /^[-\w.]+$/ or next FILE;
      $f = "$fn/$f";
      -f $f or next FILE;
      push @f, $f;
    }
    closedir $dh;
    for my $f (sort @f) { parse $f; }
  } else {
    parse $fn;
  }
}

## Start writing output.
print <<EOF;
### -*-conf-*- GENERATED by $QUIS: DO NOT EDIT!
###
### Package sources.

EOF

## Work through the subscription list.
for my $pair (@SUB) {
  my @dist = @{$pair->[0]};
  my @rel = @{$pair->[1]};

  ## Write a stanza for each distribution.
  for my $dist (@dist) {

    ## Find the configuration set for the distribution.
    defined (my $cset = $CSET{$dist})
      or fail "unknown distribution `$dist'";
    my @ix = cset_indices $cset, $dist;

    ## Print a banner to break up the monotony.
    my %a = ();
    cset_fetch %a, @{[$cset, \%DEFAULT]}, 0, "default", qw(banner);
    print "###", "-" x 74, "\n";
    print "### ", $a{banner}, "\n" if exists $a{banner};

    ## Write a paragraph for each release.
    for my $rel (@rel) {

      ## Write a header.
      print "\n## $rel\n";

      ## Prepare a list of configuration sections to provide variables for
      ## expansion.
      my @cset = ({ RELEASE => { "*" => $rel } }, $cset, \%DEFAULT);

      ## Work through each index.
      IX: for my $ix (@ix) {

	## Fetch properties from the configuration set.
	%a = (options	=> undef,
	      release	=> $rel,
	      releases	=> "*",
	      types	=> "deb deb-src");
	cset_fetch %a, @cset, 1, $ix, qw(uri components);
	cset_fetch %a, @cset, 0, $ix, qw(types options release releases);

	## Check that this release matches the index.
	my $matchp = 0;
	for my $rpat (shellwords $a{releases}) {
	  $matchp = 1, last if fnmatch $rpat, $rel;
	}
	next IX unless $matchp;

	## Build an output line.
	my $out = "";
	if (defined (my $opt = $a{options})) { $out .= "[ $opt ] "; }
	$out .= "$a{uri} $a{release} $a{components}";

	## Canonify whitespace.
	$out =~ s/^\s+//; $out =~ s/\s+$//; $out =~ s/\s+/ /;

	## Write out the necessary
	print "$_ $out\n" for shellwords $a{types};
      }
    }
    print "\n";
  }
}

## Write a trailer.
print "###----- That's all, folks ", "-" x 50, "\n";
print "### GENERATED by $QUIS: NO NOT EDIT!\n";

###--------------------------------------------------------------------------
### Documentation.

=head1 NAME

mkaptsrc - generate APT `sources.list' files

=head1 SYNOPSIS

B<mkaptsrc> I<file>|I<dir>...

=head1 DESCRIPTION

The B<mkaptsrc> progrem generates an APT F<sources.list> file from a
collection of configuration files.  It allows a site to use a single master
file defining all (or most) of the available archives, while allowing each
individiual host to describe succinctly which archives it actually wants to
track.

The command line arguments are a list of one or more filenames and/or
directories.  The program reads the files one by one, in order; a directory
stands for all of the regular files it contains whose names consist only of
alphanumeric characters, dots C<.>, underscores C<_>, and hyphens C<->, in
ascending lexicographic order.  (Nested subdirectories are ignored.)  Later
files can override settings from earlier ones.

=head2 Command-line syntax

The following command-line options are recognized.

=over

=item B<-h>, B<--help>

Print help about the program to standard output, and exit.

=item B<-v>, B<--version>

Print B<mkaptsrc>'s version number to standard output, and exit.

=back

=head2 Configuration syntax

The configuration files are split into stanze.  Each stanza begins with an
unindented header line, followed by zero or more indented body lines.  Blank
lines (containing only whitespace) and comments (whose first non-whitespace
character is C<#>) are ignored E<ndash> and in particular are not considered
when determining the boundaries of stanze.  It is not possible to split a
stanza between two files.

A I<distribution stanza> consists of a line

=over

B<distribution> I<dist>

=back

followed by a number of indented assignments

=over

I<tag> = I<value>

=back

or

=over

I<tag>B<[>I<pat>B<]> = I<value>

=back

Here, I<dist> is a name for this distribution; this name is entirely internal
to the configuration and has no external meaning.  Several stanze may use the
same I<dist>: the effect is the same as a single big stanza containing all of
the assignments in order.

Each assignment line sets the value of a I<tag> for the distribution; if the
I<tag> has already been assigned a value then the old value is forgotten.
The optional I<pat> may be used to assign different values to the same tag
according to different I<contexts>, distinguished by glob patterns: see the
description below.  Omitting the I<pat> is equivalent to using the wildcard
pattern C<*>.

A I<default stanza> consists of a line

=over

B<defaults>

=back

followed by assignments as for a distribution stanza.  Again, there may be
many default stanze, and the effect is the same as a single big default
stanza containing all of the assignments in order.  During output, tags are
looked up first in the relevant distribution, and if there no matching
assignments then the B<defaults> assignments are searched.

A I<subscription stanza> consists of a line

=over

B<subscribe>

=back

followed by indented subscription lines

=over

I<dist> [I<dist> ...] B<:> I<release> [I<release> ...]

=back

Such a line is equivalent to a sequence of lines

=over

I<dist> B<:> I<release> [I<release> ...]

=back

one for each I<dist>, in order.

It is permitted for several lines to name the same I<dist>, though currently
the behaviour is not good: they are treated entirely independently.  The
author is not sure what the correct behaviour ought to be.

=head2 Tag lookup and value expansion

The output of B<mkaptsrc> is largely constructed by looking up tags and using
their values.  A tag is always looked up in a particular I<distribution> and
with reference to a particular I<context>.  Contexts are named with an
I<index>.  The resulting value is the last assignment in the distribution's
stanze whose tag is equal to the tag being looked up, and whose pattern is
either absent or matches the context index.  If there is no matching
assignment, then the default assignments are checked, and again the last
match is used.  If there is no default assignment either, then the lookup
fails; this might or might not be an error.

Once the value has been found, it is I<expanded> before use.  Any
placeholders of the form B<%>I<tag> or B<%{>I<tag>B<}> (the latter may be
used to distinguish the I<tag> name from any immediately following text) are
replaced by the (expanded) value of the I<tag>, using the same distribution
and context as the original lookup.  It is a fatal error for a lookup of a
tag to fail during expansion.  Recursive expansion is forbidden.

There are some special tags given values by B<mkaptsrc>.  Their names are
written in all upper-case.

=head2 Output

The output is always written to stdout.  It begins with a header comment
(which you can't modify), including a warning that the file is generated and
shouldn't be edited.

The output is split into sections, one for each I<dist> in the subcription
stanze.  Each section begins with a comment banner, whose text is the result
of looking up the tag B<banner> in the distribution, using the context index
B<default>; if the lookup fails then no banner text is added.

The distribution section is split into paragraphs, one for each I<release>
listed in the subscription line, and headed with a comment naming the
I<release>.  The contents of the paragraph are determined by assignments in
the distribution stanza for I<dist>.

The set of context indices for the paragraph is determined, as follows.

=over

=item *

The tag B<indices> is looked up in the distribution I<dist>.  This lookup is
special in three ways: firstly, lookup will I<not> fall back to the
B<defaults> assignments; secondly, only assignments with no pattern (or,
equivalently, with pattern C<*>) are examined; and, thirdly, the result is
I<not> subject to expansion.  If a value is found, then the context indices
are precisely the space-separated words of the value.

=item *

If there assignments in the distribution I<dist> whose patterns are
I<literal> E<ndash> i.e., contain no metacharacters C<*>, C<?>, C<[>, or
C<\\> E<ndash> then the context indices are precisely these literal patterns,
in the order in which they first appeared.

=item *

If all of the assignments for the distribution I<dist> have no pattern (or,
equivalently, have pattern C<*>), then there is exactly one context index
B<default>.

=item *

Otherwise the situation is a fatal error.  You should resolve this unlikely
situation by setting an explicit B<indices> value.

=back

The contexts are now processed in turn.  Each lookup described below happens
in the distribution I<dist>, with respect to the context being processed.
Furthermore, the special tag B<RELEASE> is given the value I<release>.

The tag B<releases> is looked up, and split into a space-separated sequence
of glob patterns.  If the I<release> doesn't match any of these patterns then
the context is ignored.  (If the lookup fails, the context is always used,
as if the value had been C<*>.)

Finally, a sequence of lines is written, of the form

=over

I<type> S<B<[> I<options> B<]>> I<uri> I<release> I<components>

=back

one for each word in the value of B<types>, defaulting to B<deb> B<deb-src>.
Other pieces correspond to the values of tags to be looked up: I<release>
defaults to the name provided in the B<subscribe> stanza; if I<options> is
omitted then there will be no S<B<[> I<options> B<]>> piece; it is a fatal
error if other lookups fail.

=head1 EXAMPLES

The package repository for the official Linux Spotify client can be described
as follows.

	distribution spotify
	  banner = Spotify client for Linux.
	  uri = http://repository.spotify.com/
	  components = non-free
	  types = deb

	subscribe
	  spotify : stable

This produces the output

	###------------------------------------------------------------
	### Spotify client for Linux.

	## stable
	deb http://repository.spotify.com/ stable non-free

As a more complex example, I describe the official Debian package archive as
follows.

	default
	  debmirror = http://mirror.distorted.org.uk
	  debsecurity = http://security.debian.org

	distribution debian
	  banner = Debian GNU/Linux.
	  uri[base] = %debmirror/debian/
	  uri[security-local] = %debmirror/debian-security/
	  uri[security-upstream] = %debsecurity/debian-security/
	  release[security-*] = %RELEASE/updates
	  releases[security-*] = oldstable stable testing
	  components = main non-free contrib
	  components[security-*] = main

	subscribe
	  debian : stable testing unstable

This arranges to use my local mirror for both the main archive and for
security updates, but I<also> to use the upstream archive for security
updates which I might not have mirrored yet.  Setting B<releases[security-*]>
copes with the fact that there are no separate security releases for the
B<unstable> release.

On machines which are far away from my mirror, I override these settings by
writing

	distribution debian
	  debmirror = http://ftp.uk.debian.org
	  indices = base security-upstream

in a host-local file (which has the effect of disabling the B<security-local>
context implicitly defined in the base stanza.

=head1 BUGS

Redefinition of subscriptions currently isn't well behaved.

=head1 SEE ALSO

L<sources.list(5)>

=head1 AUTHOR

Mark Wooding <mdw@distorted.org.uk>

=cut

###----- That's all, folks --------------------------------------------------