X-Git-Url: https://git.distorted.org.uk/~mdw/runlisp/blobdiff_plain/7b8ff279e7304e41b243459d78c3b6703bb8c3f5..47af3df3850a1a17ee706b278868309d40bacd55:/lib.c diff --git a/lib.c b/lib.c index 104eebd..b509cea 100644 --- a/lib.c +++ b/lib.c @@ -40,26 +40,23 @@ #include "lib.h" -/*----- Miscellany --------------------------------------------------------*/ - -int str_lt(const char *a, size_t an, const char *b, size_t bn) -{ - if (an < bn) return (MEMCMP(a, <=, b, an)); - else return (MEMCMP(a, <, b, bn)); -} - /*----- Diagnostic utilities ----------------------------------------------*/ const char *progname = "???"; + /* Our program name, for use in error messages. */ +/* Set `progname' from the pathname in PROG (typically from `argv[0]'). */ void set_progname(const char *prog) { const char *p; p = strrchr(prog, '/'); - progname = p ? p + 1 : progname; + progname = p ? p + 1 : prog; } +/* Report an error or warning in Unix style, given a captured argument + * cursor. + */ void vmoan(const char *msg, va_list ap) { fprintf(stderr, "%s: ", progname); @@ -67,14 +64,21 @@ void vmoan(const char *msg, va_list ap) fputc('\n', stderr); } +/* Issue a warning message. */ void moan(const char *msg, ...) { va_list ap; va_start(ap, msg); vmoan(msg, ap); va_end(ap); } +/* Issue a fatal error message and exit unsuccessfully. */ void lose(const char *msg, ...) { va_list ap; va_start(ap, msg); vmoan(msg, ap); va_end(ap); exit(127); } /*----- Memory allocation -------------------------------------------------*/ +/* Allocate and return a pointer to N bytes, or report a fatal error. + * + * Release the pointer using `free' as usual. If N is zero, returns null + * (but you are not expected to check for this). + */ void *xmalloc(size_t n) { void *p; @@ -84,6 +88,13 @@ void *xmalloc(size_t n) return (p); } +/* Resize the block at P (from `malloc' or `xmalloc') to be N bytes long. + * + * The block might (and probably will) move, so it returns the new address. + * If N is zero, then the block is freed (if necessary) and a null pointer + * returned; otherwise, if P is null then a fresh block is allocated. If + * allocation fails, then a fatal error is reported. + */ void *xrealloc(void *p, size_t n) { if (!n) { free(p); return (0); } @@ -92,6 +103,11 @@ void *xrealloc(void *p, size_t n) return (p); } +/* Allocate and return a copy of the N-byte string starting at P. + * + * The new string is null-terminated, though P need not be. If allocation + * fails, then a fatal error is reported. + */ char *xstrndup(const char *p, size_t n) { char *q = xmalloc(n + 1); @@ -100,14 +116,24 @@ char *xstrndup(const char *p, size_t n) return (q); } +/* Allocate and return a copy of the null-terminated string starting at P. + * + * If allocation fails, then a fatal error is reported. + */ char *xstrdup(const char *p) { return (xstrndup(p, strlen(p))); } /*----- Dynamic strings ---------------------------------------------------*/ +/* Initialize the string D. + * + * Usually you'd use the static initializer `DSTR_INIT'. + */ void dstr_init(struct dstr *d) { d->p = 0; d->len = d->sz = 0; } +/* Reset string D so it's empty again. */ void dstr_reset(struct dstr *d) { d->len = 0; } +/* Ensure that D has at least N unused bytes available. */ void dstr_ensure(struct dstr *d, size_t n) { size_t need = d->len + n, newsz; @@ -118,11 +144,25 @@ void dstr_ensure(struct dstr *d, size_t n) d->p = xrealloc(d->p, newsz); d->sz = newsz; } +/* Release the memory held by D. + * + * It must be reinitialized (e.g., by `dstr_init') before it can be used + * again. + */ void dstr_release(struct dstr *d) { free(d->p); } +/* Append the N-byte string at P to D. + * + * P need not be null-terminated. D will not be null-terminated + * afterwards. + */ void dstr_putm(struct dstr *d, const void *p, size_t n) { dstr_ensure(d, n); memcpy(d->p + d->len, p, n); d->len += n; } +/* Append the null-terminated string P to D. + * + * D /is/ guaranteed to be null-terminated after this. + */ void dstr_puts(struct dstr *d, const char *p) { size_t n = strlen(p); @@ -132,15 +172,33 @@ void dstr_puts(struct dstr *d, const char *p) d->len += n; } +/* Append the single character CH to D. + * + * D will not be null-terminated afterwards. + */ void dstr_putc(struct dstr *d, int ch) { dstr_ensure(d, 1); d->p[d->len++] = ch; } +/* Append N copies of the character CH to D. + * + * D will not be null-terminated afterwards. + */ void dstr_putcn(struct dstr *d, int ch, size_t n) { dstr_ensure(d, n); memset(d->p + d->len, ch, n); d->len += n; } +/* Null-terminate the string D. + * + * This doesn't change the length of D. If further stuff is appended then + * the null terminator will be overwritten. + */ void dstr_putz(struct dstr *d) { dstr_ensure(d, 1); d->p[d->len] = 0; } +/* Append stuff to D, determined by printf(3) format string P and argument + * tail AP. + * + * D will not be null-terminated afterwards. + */ void dstr_vputf(struct dstr *d, const char *p, va_list ap) { va_list ap2; @@ -158,9 +216,20 @@ void dstr_vputf(struct dstr *d, const char *p, va_list ap) d->len += n; } +/* Append stuff to D, determined by printf(3) format string P and arguments. + * + * D will not be null-terminated afterwards. + */ PRINTF_LIKE(2, 3) void dstr_putf(struct dstr *d, const char *p, ...) { va_list ap; va_start(ap, p); dstr_vputf(d, p, ap); va_end(ap); } +/* Append the next input line from FP to D. + * + * Return 0 on success, or -1 if reading immediately fails or encounters + * end-of-file (call ferror(3) to distinguish). Any trailing newline is + * discarded: it is not possible to determine whether the last line was ended + * with a newline. D is guaranteed to be null-terminated afterwards. + */ int dstr_readline(struct dstr *d, FILE *fp) { size_t n; @@ -180,11 +249,17 @@ int dstr_readline(struct dstr *d, FILE *fp) /*----- Dynamic vectors of strings ----------------------------------------*/ +/* Initialize the vector AV. + * + * Usually you'd use the static initializer `ARGV_INIT'. + */ void argv_init(struct argv *av) { av->v = 0; av->o = av->n = av->sz = 0; } +/* Reset the vector AV so that it's empty again. */ void argv_reset(struct argv *av) { av->n = 0; } +/* Ensure that AV has at least N unused slots at the end. */ void argv_ensure(struct argv *av, size_t n) { size_t need = av->n + av->o + n, newsz; @@ -192,12 +267,11 @@ void argv_ensure(struct argv *av, size_t n) if (need <= av->sz) return; newsz = av->sz ? 2*av->sz : 8; while (newsz < need) newsz *= 2; - av->v = - (const char **)xrealloc(av->v - av->o, newsz*sizeof(const char *)) + - av->o; + av->v = xrealloc(av->v - av->o, newsz*sizeof(char *)); av->v += av->o; av->sz = newsz; } +/* Ensure that AV has at least N unused slots at the /start/. */ void argv_ensure_offset(struct argv *av, size_t n) { size_t newoff; @@ -210,59 +284,81 @@ void argv_ensure_offset(struct argv *av, size_t n) newoff = 16; while (newoff < n) newoff *= 2; argv_ensure(av, newoff - av->o); - memmove(av->v + newoff - av->o, av->v, av->n*sizeof(const char *)); + memmove(av->v + newoff - av->o, av->v, av->n*sizeof(char *)); av->v += newoff - av->o; av->o = newoff; } +/* Release the memory held by AV. + * + * It must be reinitialized (e.g., by `argv_init') before it can be used + * again. + */ void argv_release(struct argv *av) { free(av->v - av->o); } -void argv_append(struct argv *av, const char *p) +/* Append the pointer P to AV. */ +void argv_append(struct argv *av, char *p) { argv_ensure(av, 1); av->v[av->n++] = p; } +/* Append a null pointer to AV, without extending the vactor length. + * + * The null pointer will be overwritten when the next string is appended. + */ void argv_appendz(struct argv *av) { argv_ensure(av, 1); av->v[av->n] = 0; } -void argv_appendn(struct argv *av, const char *const *v, size_t n) +/* Append a N-element vector V of pointers to AV. */ +void argv_appendn(struct argv *av, char *const *v, size_t n) { argv_ensure(av, n); memcpy(av->v + av->n, v, n*sizeof(const char *)); av->n += n; } +/* Append the variable-length vector BV to AV. */ void argv_appendav(struct argv *av, const struct argv *bv) { argv_appendn(av, bv->v, bv->n); } +/* Append the pointers from a variable-length argument list AP to AV. + * + * The list is terminated by a null pointer. + */ void argv_appendv(struct argv *av, va_list ap) { - const char *p; - - for (;;) - { p = va_arg(ap, const char *); if (!p) break; argv_append(av, p); } + char *p; + for (;;) { p = va_arg(ap, char *); if (!p) break; argv_append(av, p); } } +/* Append the argument pointers, terminated by a null pointer, to AV. */ void argv_appendl(struct argv *av, ...) { va_list ap; va_start(ap, av); argv_appendv(av, ap); va_end(ap); } -void argv_prepend(struct argv *av, const char *p) +/* Prepend the pointer P to AV. */ +void argv_prepend(struct argv *av, char *p) { argv_ensure_offset(av, 1); *--av->v = p; av->o--; av->n++; } -void argv_prependn(struct argv *av, const char *const *v, size_t n) +/* Prepend a N-element vector V of pointers to AV. */ +void argv_prependn(struct argv *av, char *const *v, size_t n) { argv_ensure_offset(av, n); av->o -= n; av->v -= n; av->n += n; memcpy(av->v, v, n*sizeof(const char *)); } +/* Prepend the variable-length vector BV to AV. */ void argv_prependav(struct argv *av, const struct argv *bv) { argv_prependn(av, bv->v, bv->n); } +/* Prepend the pointers from a variable-length argument list AP to AV. + * + * The list is terminated by a null pointer. + */ void argv_prependv(struct argv *av, va_list ap) { - const char *p, **v; + char *p, **v; size_t n = 0; for (;;) { - p = va_arg(ap, const char *); if (!p) break; + p = va_arg(ap, char *); if (!p) break; argv_prepend(av, p); n++; } v = av->v; @@ -272,31 +368,85 @@ void argv_prependv(struct argv *av, va_list ap) } } +/* Prepend the argument pointers, terminated by a null pointer, to AV. */ void argv_prependl(struct argv *av, ...) { va_list ap; va_start(ap, av); argv_prependv(av, ap); va_end(ap); } /*----- Treaps ------------------------------------------------------------*/ +/* Return nonzero if the AN-byte string A is strictly precedes the BN-byte + * string B in a lexicographic ordering. + * + * All comparison of keys is handled by this function. + */ +static int str_lt(const char *a, size_t an, const char *b, size_t bn) +{ + /* This is a little subtle. We need only compare the first N bytes of the + * strings, where N is the length of the shorter string. If this + * distinguishes the two strings, then we're clearly done. Otherwise, if + * the prefixes are equal then the shorter string is the smaller one. If + * the two strings are the same length, then they're equal. + * + * Hence, if A is the strictly shorter string, then A precedes B if A + * precedes or matches the prefix of B; otherwise A only precedes B if A + * strictly precedes the prefix of B. + */ + if (an < bn) return (MEMCMP(a, <=, b, an)); + else return (MEMCMP(a, <, b, bn)); +} + +/* Initialize the treap T. + * + * Usually you'd use the static initializer `TREAP_INIT'. + */ void treap_init(struct treap *t) { t->root = 0; } +/* Look up the KN-byte key K in the treap T. + * + * Return a pointer to the matching node if one was found, or null otherwise. + */ void *treap_lookup(const struct treap *t, const char *k, size_t kn) { struct treap_node *n = t->root, *candidate = 0; - while (n) { + /* This is a simple prototype for some of the search loops we'll encounter + * later. Notice that we use a strict one-sided comparison, rather than + * the more conventional two-sided comparison. + * + * The main loop will find the largest key not greater than K. + */ + while (n) + /* Compare the node's key against our key. If the node is too large, + * then we ignore it and move left. Otherwise remember this node for + * later, and move right to see if we can find a better, larger node. + */ + if (str_lt(k, kn, n->k, n->kn)) n = n->left; else { candidate = n; n = n->right; } - } + + /* If the candidate node is less than our key then we failed. Otherwise, + * by trichotomy, we have found the correct node. + */ if (!candidate || str_lt(candidate->k, candidate->kn, k, kn)) return (0); return (candidate); } +/* Look up the KN-byte K in the treap T, recording a path in P. + * + * This is similar to `treap_lookup', in that it returns the requested node + * if it already exists, or null otherwise, but it also records in P + * information to be used by `treap_insert' to insert a new node with the + * given key if it's not there already. + */ void *treap_probe(struct treap *t, const char *k, size_t kn, struct treap_path *p) { struct treap_node **nn = &t->root, *candidate = 0; unsigned i = 0; + /* This walk is similar to `treap_lookup' above, except that we also record + * the address of each node pointer we visit along the way. + */ for (;;) { assert(i < TREAP_PATHMAX); p->path[i++] = nn; if (!*nn) break; @@ -304,10 +454,16 @@ void *treap_probe(struct treap *t, const char *k, size_t kn, else { candidate = *nn; nn = &(*nn)->right; } } p->nsteps = i; + + /* Check to see whether we found the right node. */ if (!candidate || str_lt(candidate->k, candidate->kn, k, kn)) return (0); return (candidate); } +/* Insert a new node N into T, associating it with the KN-byte key K. + * + * Use the path data P, from `treap_probe', to help with insertion. + */ void treap_insert(struct treap *t, const struct treap_path *p, struct treap_node *n, const char *k, size_t kn) { @@ -315,46 +471,166 @@ void treap_insert(struct treap *t, const struct treap_path *p, struct treap_node **nn, **uu, *u; unsigned wt; + /* Fill in the node structure. */ n->k = xstrndup(k, kn); n->kn = kn; n->wt = wt = rand(); n->left = n->right = 0; + + /* Prepare for the insertion. + * + * The path actually points to each of the links traversed when searching + * for the node, starting with the `root' pointer, then the `left' or + * `right' pointer of the root node, and so on; `nsteps' will always be + * nonzero, since the path will always pass through the root, and the final + * step, `path->path[path->nsteps - 1]' will always be the address of a + * null pointer onto which the freshly inserted node could be hooked in + * order to satisfy the binary-search-tree ordering. (Of course, this will + * likely /not/ satisfy the heap condition, so more work needs to be done.) + * + * Throughout, NN is our current candidate for where to attach the node N. + * As the loop progresses, NN will ascend to links further up the tree, and + * N will be adjusted to accumulate pieces of the existing tree structure. + * We'll stop when we find that the parent node's weight is larger than our + * new node's weight, at which point we can just set *NN = N; or if we run + * out of steps in the path, in which case *NN is the root pointer. + */ assert(i); nn = p->path[--i]; while (i--) { + + /* Collect the next step in the path, and get the pointer to the node. */ uu = p->path[i]; u = *uu; + + /* If this node's weight is higher, then we've found the right level and + * we can stop. + */ if (wt <= u->wt) break; + + /* The node U is lighter than our new node N, so we must rotate in order + * to fix things. If we were currently planning to hook N as the left + * subtree of U, then we rotate like this: + * + * | | + * U (N) + * / \ / \ + * (N) Z ---> X U + * / \ / \ + * X Y Y Z + * + * On the other hand, if we were planning to hook N as the right subtree + * of U, then we do the opposite rotation: + * + * | | + * U (N) + * / \ / \ + * X (N) ---> U Z + * / \ / \ + * Y Z X Y + * + * These transformations clearly preserve the ordering of nodes in the + * binary search tree, and satisfy the heap condition in the subtree + * headed by N. + */ if (nn == &u->left) { u->left = n->right; n->right = u; } else { u->right = n->left; n->left = u; } + + /* And this arrangement must be attached to UU, or some higher attachment + * point. The subtree satisfies the heap condition, and can be attached + * safely at the selected place. + */ nn = uu; } + + /* We've found the right spot. Hook the accumulated subtree into place. */ *nn = n; } +/* Remove the node with the KN-byte K from T. + * + * Return the address of the node we removed, or null if it couldn't be + * found. + */ void *treap_remove(struct treap *t, const char *k, size_t kn) { struct treap_node **nn = &t->root, **candidate = 0, *n, *l, *r; - while (*nn) { + /* Search for the matching node, but keep track of the address of the link + * which points to our target node. + */ + while (*nn) if (str_lt(k, kn, (*nn)->k, (*nn)->kn)) nn = &(*nn)->left; else { candidate = nn; nn = &(*nn)->right; } - } + + /* If this isn't the right node then give up. */ if (!candidate || str_lt((*candidate)->k, (*candidate)->kn, k, kn)) return (0); - n = *candidate; l = n->left; r = n->right; - for (;;) { - if (l && (!r || l->wt > r->wt)) { nn = &l->right; l = l->right; } - else if (r) { nn = &r->left; r = r->left; } - else break; - } - *nn = 0; - free(n->k); - return (n); + /* Now we need to disentangle the node from the tree. This is essentially + * the reverse of insertion: we pretend that this node is suddenly very + * light, and mutate the tree so as to restore the heap condition until + * eventually our node is a leaf and can be cut off without trouble. + * + * Throughout, the link *NN notionally points to N, but we don't actually + * update it until we're certain what value it should finally take. + */ + nn = candidate; n = *nn; l = n->left; r = n->right; + for (;;) + + /* If its left subtree is empty then we can replace our node by its right + * subtree and be done. Similarly, if the right subtree is empty then we + * replace the node by its left subtree. + * + * | | | | + * (N) ---> R ; (N) ---> L + * / \ / \ + * * R L * + */ + if (!l) { *nn = r; break; } + else if (!r) { *nn = l; break; } + + /* Otherwise we need to rotate the pointers so that the heavier of the + * two children takes the place of our node; thus we have either + * + * | | + * (N) L + * / \ / \ + * L R ---> X (N) + * / \ / \ + * X Y Y R + * + * or + * + * | | + * (N) R + * / \ / \ + * L R ---> (N) Y + * / \ / \ + * X Y L X + * + * Again, these transformations clearly preserve the ordering of nodes in + * the binary search tree, and the heap condition. + */ + else if (l->wt > r->wt) + { *nn = l; nn = &l->right; l = n->left = l->right; } + else + { *nn = r; nn = &r->left; r = n->right = r->left; } + + /* Release the key buffer, and return the node that we've now detached. */ + free(n->k); return (n); } +/* Initialize an iterator I over T's nodes. */ void treap_start_iter(struct treap *t, struct treap_iter *i) { struct treap_node *n = t->root; unsigned sp = 0; + /* The `stack' in the iterator structure is an empty ascending stack of + * nodes which have been encountered, and their left subtrees investigated, + * but not yet visited by the iteration. + * + * Iteration begins by stacking the root node, its left child, and so on, + * At the end of this, the topmost entry on the stack is the least node of + * the tree, followed by its parent, grandparent, and so on up to the root. + */ while (n) { assert(sp < TREAP_PATHMAX); i->stack[sp++] = n; n = n->left; @@ -362,12 +638,73 @@ void treap_start_iter(struct treap *t, struct treap_iter *i) i->sp = sp; } +/* Return the next node from I, in ascending order by key. + * + * If there are no more nodes, then return null. + */ void *treap_next(struct treap_iter *i) { struct treap_node *n, *o; unsigned sp = i->sp; + /* We say that a node is /visited/ once it's been returned by this + * iterator. To traverse a tree in order, then, we traverse its left + * subtree, visit the tree root, and traverse its right subtree -- which is + * a fine recursive definition, but we need a nonrecursive implementation. + * + * As is usual in this kind of essential structural recursion, we maintain + * a stack. The invariant that we'll maintain is as follows. + * + * 1. If the stack is empty, then all nodes have been visited. + * + * 2, If the stack is nonempty then the topmost entry on the stack is the + * least node which has not yet been visited -- and therefore is the + * next node to visit. + * + * 3. The earlier entries in the stack are, in (top to bottom) order, + * those of the topmost node's parent, grandparent, etc., up to the + * root, which have not yet been visited. More specifically, a node + * appears in the stack if and only if some node in its left subtree + * is nearer the top of the stack. + * + * When we initialized the iterator state (in `treap_start_iter' above), we + * traced a path to the leftmost leaf, stacking the root, its left-hand + * child, and so on. The leftmost leaf is clearly the first node to be + * visited, and its entire ancestry is on the stack since none of these + * nodes has yet been visited. (If the tree is empty, then we have done + * nothing, the stack is empty, and there are no nodes to visit.) This + * establishes the base case for the induction. + */ + + /* So, if the stack is empty now, then (1) all of the nodes have been + * visited and there's nothing left to do. Return null. + */ if (!sp) return (0); + + /* It's clear that, if we pop the topmost element of the stack, visit it, + * and arrange to reestablish the invariant, then we'll visit the nodes in + * the correct order, pretty much by definition. + * + * So, pop a node off the stack. This is the node we shall return. But + * before we can do that, we must reestablish the above invariant. + * Firstly, the current node is removed from the stack, because we're about + * to visit it, and visited nodes don't belong on the stack. Then there + * are two cases to consider. + * + * * If the current node's right subtree is not empty, then the next node + * to be visited is the leftmost node in that subtree. All of the + * nodes on the stack are ancestors of the current node, and the right + * subtree consists of its descendants, so none of them are already on + * the stack; and they're all greater than the current node, and + * therefore haven't been visited. Therefore, we must push the current + * node's right child, its /left/ child, and so on, proceeding + * leftwards until we fall off the bottom of the tree. + * + * * Otherwise, we've finished traversing some subtree. Either we are + * now done, or (3) we have just finished traversing the left subtree + * of the next topmost item on the stack. This must therefore be the + * next node to visit. The rest of the stack is already correct. + */ n = i->stack[--sp]; o = n->right; while (o) { @@ -378,19 +715,44 @@ void *treap_next(struct treap_iter *i) return (n); } -static void check_node(struct treap_node *n, unsigned maxwt, - const char *klo, const char *khi) +/* Recursively check the subtree headed by N. + * + * No node should have weight greater than MAXWT, to satisfy the heap + * condition; if LO is not null, then all node keys should be strictly + * greater than LO, and, similarly, if HI is not null, then all keys should + * be strictly smaller than HI. + */ +static void check_subtree(struct treap_node *n, unsigned maxwt, + const char *klo, const char *khi) { + /* Check the heap condition. */ assert(n->wt <= maxwt); + + /* Check that the key is in bounds. (Use `strcmp' here to ensure that our + * own `str_lt' is working correctly.) + */ if (klo) assert(STRCMP(n->k, >, klo)); if (khi) assert(STRCMP(n->k, <, khi)); - if (n->left) check_node(n->left, n->wt, klo, n->k); - if (n->right) check_node(n->right, n->wt, n->k, khi); + + /* Check the left subtree. Node weights must be bounded above by our own + * weight. And every key in the left subtree must be smaller than our + * current key. We propagate the lower bound. + */ + if (n->left) check_subtree(n->left, n->wt, klo, n->k); + + /* Finally, check the right subtree. This time, every key must be larger + * than our key, and we propagate the upper bound. + */ + if (n->right) check_subtree(n->right, n->wt, n->k, khi); } +/* Check the treap structure rules for T. */ void treap_check(struct treap *t) - { if (t->root) check_node(t->root, t->root->wt, 0, 0); } + { if (t->root) check_subtree(t->root, t->root->wt, 0, 0); } +/* Recursively dump the subtree headed by N, indenting the output lines by + * IND spaces. + */ static void dump_node(struct treap_node *n, int ind) { if (n->left) dump_node(n->left, ind + 1); @@ -398,6 +760,7 @@ static void dump_node(struct treap_node *n, int ind) if (n->right) dump_node(n->right, ind + 1); } +/* Dump the treap T to standard output, for debugging purposes. */ void treap_dump(struct treap *t) { if (t->root) dump_node(t->root, 0); } /*----- Configuration file parsing ----------------------------------------*/ @@ -406,13 +769,48 @@ void treap_dump(struct treap *t) { if (t->root) dump_node(t->root, 0); } extern char **environ; #endif +/* Advance P past a syntactically valid name, but no further than L. + * + * Return the new pointer. If no name is found, report an error, blaming + * FILE and LINE; WHAT is an adjective for the kind of name that was + * expected. + */ +static const char *scan_name(const char *what, + const char *p, const char *l, + const char *file, unsigned line) +{ + const char *q = p; + + while (q < l && + (ISALNUM(*q) || *q == '-' || *q == '_' || *q == '.' || *q == '/' || + *q == '*' || *q == '+' || *q == '%' || *q == '@')) + q++; + if (q == p) lose("%s:%u: expected %s name", file, line, what); + return (q); +} + +/* Initialize the configuration state CONF. + * + * Usually you'd use the static initializer `CONFIG_INIT'. + */ void config_init(struct config *conf) { treap_init(&conf->sections); } +/* Find and return the section with null-terminated NAME in CONF. + * + * If no section is found, the behaviour depends on whether `CF_CREAT' is set + * in F: if so, an empty section is created and returned; otherwise, a null + * pointer is returned. + */ struct config_section *config_find_section(struct config *conf, unsigned f, const char *name) { return (config_find_section_n(conf, f, name, strlen(name))); } +/* Find and return the section with the SZ-byte NAME in CONF. + * + * This works like `config_find_section', but with an explicit length for the + * NAME rather than null-termination. + */ struct config_section *config_find_section_n(struct config *conf, unsigned f, const char *name, size_t sz) { @@ -430,18 +828,27 @@ struct config_section *config_find_section_n(struct config *conf, unsigned f, sect->parents = 0; sect->nparents = SIZE_MAX; treap_init(§->vars); treap_init(§->cache); treap_insert(&conf->sections, &path, §->_node, name, sz); - config_set_var_n(conf, sect, CF_LITERAL, "@NAME", 5, name, sz); + config_set_var_n(conf, sect, CF_LITERAL, "@name", 5, name, sz); } } return (sect); } +/* Set the fallback section for CONF to be SECT. + * + * That is, if a section has no explicit parents, then by default it will + * have a single parent which is SECT. If SECT is null then there is no + * fallback section, and sections which don't have explicitly specified + * parents have no parents at all. (This is the default situation.) + */ void config_set_fallback(struct config *conf, struct config_section *sect) -{ - if (sect->nparents == SIZE_MAX) sect->nparents = 0; - conf->fallback = sect; -} + { conf->fallback = sect; } +/* Arrange that SECT has PARENT as its single parent section. + * + * If PARENT is null, then arrange that SECT has no parents at all. In + * either case, any `@parents' setting will be ignored. + */ void config_set_parent(struct config_section *sect, struct config_section *parent) { @@ -453,10 +860,15 @@ void config_set_parent(struct config_section *sect, } } +/* Initialize I to iterate over the sections defined in CONF. */ void config_start_section_iter(struct config *conf, struct config_section_iter *i) { i->sect = conf->head; } +/* Return the next section from I, in order of creation. + * + * If there are no more sections, then return null. + */ struct config_section *config_next_section(struct config_section_iter *i) { struct config_section *sect; @@ -466,49 +878,81 @@ struct config_section *config_next_section(struct config_section_iter *i) return (sect); } +/* Initialize the `parents' links of SECT, if they aren't set up already. + * + * If SECT contains a `@parents' setting then parse it to determine the + * parents; otherwise use CONF's fallbeck section, as established by + * `config_set_fallback'. + */ static void set_config_section_parents(struct config *conf, struct config_section *sect) { struct config_section *parent; struct config_var *var; - struct argv av = ARGV_INIT; + const char *file; unsigned line; size_t i, n; - char *p, *q; + char *p, *q, *l; + struct argv av = ARGV_INIT; + /* If the section already has parents established then there's nothing to + * do. + */ if (sect->nparents != SIZE_MAX) return; - var = treap_lookup(§->vars, "@PARENTS", 8); + /* Look up `@parents', without recursion! */ + var = treap_lookup(§->vars, "@parents", 8); if (!var) { - if (!conf->fallback) + /* No explicit setting: use the fallback setting. */ + + if (!conf->fallback || conf->fallback == sect) sect->nparents = 0; else { - sect->parents = xmalloc(sizeof(*sect->parents)); - sect->nparents = 1; + sect->parents = xmalloc(sizeof(*sect->parents)); sect->nparents = 1; sect->parents[0] = conf->fallback; } } else { - p = var->val; - for (;;) { - while (ISSPACE(*p)) p++; - if (!*p) break; - q = p; while (*q && *q != ',' && !ISSPACE(*q)) q++; - argv_append(&av, p); argv_append(&av, q); - p = q; if (*p == ',') p++; + /* Found a `@parents' list: parse it and set the parents list. */ + + file = var->file; line = var->line; if (!file) file = ""; + + /* We do this in two phases. First, we parse out the section names, and + * record start/limit pointer pairs in `av'. + */ + p = var->val; l = p + var->n; while (p < l && ISSPACE(*p)) p++; + while (*p) { + q = p; + p = (/*unconst*/ char *)scan_name("parent section", p, l, file, line); + argv_append(&av, q); argv_append(&av, p); + while (p < l && ISSPACE(*p)) p++; + if (p >= l) break; + if (*p == ',') do p++; while (ISSPACE(*p)); } + + /* Now that we've finished parsing, we know how many parents we're going + * to have, so we can allocate the `parents' vector and fill it in. + */ sect->nparents = av.n/2; - sect->parents = xmalloc(sect->nparents*sizeof(sect->parents)); + sect->parents = xmalloc(sect->nparents*sizeof(*sect->parents)); for (i = 0; i < av.n; i += 2) { n = av.v[i + 1] - av.v[i]; parent = config_find_section_n(conf, 0, av.v[i], n); if (!parent) lose("%s:%u: unknown parent section `%.*s'", - var->file, var->line, (int)n, av.v[i]); + file, line, (int)n, av.v[i]); sect->parents[i/2] = parent; } - argv_release(&av); } + + /* All done. */ + argv_release(&av); } +/* Find a setting of the SZ-byte variable NAME in CONF, starting from SECT. + * + * If successful, return a pointer to the variable; otherwise return null. + * Inheritance cycles and ambiguous inheritance are diagnosed as fatal + * errors. + */ struct config_var *search_recursive(struct config *conf, struct config_section *sect, const char *name, size_t sz) @@ -518,6 +962,25 @@ struct config_var *search_recursive(struct config *conf, struct config_var *var, *v; size_t i, j = j; + /* If the variable is defined locally then we can just return it. */ + var = treap_lookup(§->vars, name, sz); if (var) return (var); + + /* If we have no parents then there's no way we can find it. */ + set_config_section_parents(conf, sect); + if (!sect->parents) return (0); + + /* Otherwise we must visit the section's parents. We can avoid paying for + * this on every lookup by using a cache. If there's already an entry for + * this variable then we can return the result immediately (note that we + * cache both positive and negative outcomes). Otherwise we create a new + * cache entry, do the full recursive search, and fill in the result when + * we're done. + * + * The cache also helps us detect cycles: we set the `CF_OPEN' flag on a + * new cache entry when it's first created, and clear it when we fill in + * the result: if we encounter an open cache entry again, we know that + * we've found a cycle. + */ cache = treap_probe(§->cache, name, sz, &path); if (!cache) { cache = xmalloc(sizeof(*cache)); cache->f = CF_OPEN; @@ -528,32 +991,51 @@ struct config_var *search_recursive(struct config *conf, else return (cache->var); - set_config_section_parents(conf, sect); - - var = treap_lookup(§->vars, name, sz); - if (!var) { - for (i = 0; i < sect->nparents; i++) { - v = search_recursive(conf, sect->parents[i], name, sz); - if (!v); - else if (!var) { var = v; j = i; } - else if (var != v) - lose("section `%s' inherits variable `%s' ambiguously " - "via `%s' and `%s'", - CONFIG_SECTION_NAME(sect), CONFIG_VAR_NAME(var), - CONFIG_SECTION_NAME(sect->parents[j]), - CONFIG_SECTION_NAME(sect->parents[i])); - } + /* Recursively search in each parent. We insist that all parents that find + * a variable find the same binding; otherwise we declare ambiguous + * inheritance. + */ + for (i = 0; i < sect->nparents; i++) { + v = search_recursive(conf, sect->parents[i], name, sz); + if (!v); + else if (!var) { var = v; j = i; } + else if (var != v) + lose("section `%s' inherits variable `%s' ambiguously " + "via `%s' and `%s'", + CONFIG_SECTION_NAME(sect), CONFIG_VAR_NAME(var), + CONFIG_SECTION_NAME(sect->parents[j]), + CONFIG_SECTION_NAME(sect->parents[i])); } + /* All done: fill the cache entry in, clear the open flag, and return the + * result. + */ cache->var = var; cache->f &= ~CF_OPEN; return (var); } +/* Find and return the variable with null-terminated NAME in SECT. + * + * If `CF_INHERIT' is set in F, then the function searches the section's + * parents recursively; otherwise, it only checks to see whether the variable + * is set directly in SECT. + * + * If no variable is found, the behaviour depends on whether `CF_CREAT' is + * set in F: if so, an empty variable is created and returned; otherwise, a + * null pointer is returned. + * + * Setting both `CF_INHERIT' and `CF_CREAT' is not useful. + */ struct config_var *config_find_var(struct config *conf, struct config_section *sect, unsigned f, const char *name) { return (config_find_var_n(conf, sect, f, name, strlen(name))); } +/* Find and return the variable with the given SZ-byte NAME in SECT. + * + * This works like `config_find_var', but with an explicit length for the + * NAME rather than null-termination. + */ struct config_var *config_find_var_n(struct config *conf, struct config_section *sect, unsigned f, const char *name, size_t sz) @@ -576,44 +1058,74 @@ struct config_var *config_find_var_n(struct config *conf, return (var); } -void config_start_var_iter(struct config_section *sect, - struct config_var_iter *i) - { treap_start_iter(§->vars, &i->i); } - -struct config_var *config_next_var(struct config_var_iter *i) - { return (treap_next(&i->i)); } - -void config_set_var(struct config *conf, struct config_section *sect, - unsigned f, - const char *name, const char *value) +/* Set variable NAME to VALUE in SECT, with associated flags F. + * + * The names are null-terminated. The flags are variable flags: see `struct + * config_var' for details. Returns the variable. + * + * If the variable is already set and has the `CF_OVERRIDE' flag, then this + * function does nothing unless `CF_OVERRIDE' is /also/ set in F. + */ +struct config_var *config_set_var(struct config *conf, + struct config_section *sect, + unsigned f, + const char *name, const char *value) { - config_set_var_n(conf, sect, f, - name, strlen(name), - value, strlen(value)); + return (config_set_var_n(conf, sect, f, + name, strlen(name), + value, strlen(value))); } -void config_set_var_n(struct config *conf, struct config_section *sect, - unsigned f, - const char *name, size_t namelen, - const char *value, size_t valuelen) +/* As `config_set_var', except that the variable NAME and VALUE have explicit + * lengths (NAMELEN and VALUELEN, respectively) rather than being null- + * terminated. + */ +struct config_var *config_set_var_n(struct config *conf, + struct config_section *sect, + unsigned f, + const char *name, size_t namelen, + const char *value, size_t valuelen) { struct config_var *var = config_find_var_n(conf, sect, CF_CREAT, name, namelen); - if (var->f&~f&CF_OVERRIDE) return; + if (var->f&~f&CF_OVERRIDE) return (var); free(var->val); var->val = xstrndup(value, valuelen); var->n = valuelen; var->f = f; + return (var); } +/* Initialize I to iterate over the variables directly defined in SECT. */ +void config_start_var_iter(struct config *conf, struct config_section *sect, + struct config_var_iter *i) + { treap_start_iter(§->vars, &i->i); } + +/* Return next variable from I, in ascending lexicographical order. + * + * If there are no more variables, then return null. + */ +struct config_var *config_next_var(struct config_var_iter *i) + { return (treap_next(&i->i)); } + +/* Read and parse configuration FILE, applying its settings to CONF. + * + * If all goes well, the function returns 0. If the file is not found, then + * the behaviour depends on whether `CF_NOENTOK' is set in F: if so, then the + * function simply returns -1. Otherwise, a fatal error is reported. Note + * that this /only/ applies if the file does not exist (specifically, opening + * it fails with `ENOENT') -- any other problems are reported as fatal + * errors regardless of the flag setting. + */ int config_read_file(struct config *conf, const char *file, unsigned f) { struct config_section *sect; struct config_var *var; struct dstr d = DSTR_INIT, dd = DSTR_INIT; unsigned line = 0; - char *p, *q; + const char *p, *q, *r; FILE *fp; + /* Try to open the file. */ fp = fopen(file, "r"); if (!fp) { if ((f&CF_NOENTOK) && errno == ENOENT) return (-1); @@ -621,61 +1133,107 @@ int config_read_file(struct config *conf, const char *file, unsigned f) file, strerror(errno)); } + /* Find the initial section. */ sect = config_find_section(conf, CF_CREAT, "@CONFIG"); var = 0; + /* Work through the file, line by line. */ for (;;) { dstr_reset(&d); if (dstr_readline(&d, fp)) break; line++; - if (d.p[0] && !ISSPACE(d.p[0])) { + /* Trim trailing spaces from the line. The syntax is sensitive to + * leading spaces, so we can't trim those yet. + */ + while (d.len && ISSPACE(d.p[d.len - 1])) d.len--; + d.p[d.len] = 0; + + if (!*d.p || *d.p == ';') + /* Ignore comments entirely. (In particular, a comment doesn't + * interrupt a multiline variable value.) + */ + ; + + else if (ISSPACE(d.p[0])) { + /* The line starts with whitespace, so it's a continuation line. */ + + /* Skip the initial whitespace. */ + p = d.p; while (ISSPACE(*p)) p++; + + /* If we aren't collecting a variable value then this is an error. + * Otherwise, accumulate it into the current value. + */ + if (!var) + lose("%s:%u: continuation line, but no variable", file, line); + if (dd.len) dstr_putc(&dd, ' '); + dstr_putm(&dd, p, d.len - (p - d.p)); + + } else { + /* The line starts in the first column. */ + + /* If there's a value value being collected then we must commit it to + * its variable (unless there's already a setting there that says we + * shouldn't). + */ if (var) { if (!(var->f&CF_OVERRIDE)) { var->val = xstrndup(dd.p, dd.len); var->n = dd.len; } var = 0; } - if (d.p[0] == ';') - ; - else if (d.p[0] == '[') { - p = d.p + 1; q = strchr(p, ']'); - if (!q) lose("%s:%u: missing `]' in section header", file, line); + + /* Now decide what kind of line this is. */ + if (d.p[0] == '[') { + /* It's a section header. */ + + /* Parse the header. */ + p = d.p + 1; while (ISSPACE(*p)) p++; + q = scan_name("section", p, d.p + d.len, file, line); + r = q; while (ISSPACE(*r)) r++; + if (*r != ']') + lose("%s:%u: expected `]' in section header", file, line); + if (r[1]) + lose("%s:%u: trailing junk after `]' in section header", + file, line); + + /* Create the new section. */ sect = config_find_section_n(conf, CF_CREAT, p, q - p); - q++; while (ISSPACE(*q)) q++; - if (*q) lose("%s:%u: trailing junk after `]' in section header", - file, line); + } else { - p = d.p; - while (*p && !ISSPACE(*p) && *p != '{' && *p != '}' && *p != '=') - p++; + /* It's a variable assignment. Parse the name out. */ + p = scan_name("variable", d.p, d.p + d.len, file, line); var = config_find_var_n(conf, sect, CF_CREAT, d.p, p - d.p); while (ISSPACE(*p)) p++; if (*p != '=') lose("%s:%u: missing `=' in assignment", file, line); p++; while (ISSPACE(*p)) p++; + + /* Clear out the variable's initial value, unless we shouldn't + * override it. + */ if (!(var->f&CF_OVERRIDE)) { free(var->val); var->val = 0; var->f = 0; free(var->file); var->file = xstrdup(file); var->line = line; } dstr_reset(&dd); dstr_puts(&dd, p); } - } else { - p = d.p; while (ISSPACE(*p)) p++; - if (*p) { - if (!var) - lose("%s:%u: continuation line, but no variable", file, line); - if (dd.len) dstr_putc(&dd, ' '); - dstr_puts(&dd, p); - } } } + /* If there's a value under construction then commit the result. */ if (var && !(var->f&CF_OVERRIDE)) { var->val = xstrndup(dd.p, dd.len); var->n = dd.len; } - dstr_release(&d); dstr_release(&dd); + /* Close the file. */ if (fclose(fp)) lose("error reading configuration file `%s': %s", file, strerror(errno)); + + /* All done. */ + dstr_release(&d); dstr_release(&dd); return (0); } +/* Populate SECT with environment variables. + * + * Environment variables are always set with `CF_LITERAL'. + */ void config_read_env(struct config *conf, struct config_section *sect) { const char *p, *v; @@ -689,50 +1247,113 @@ void config_read_env(struct config *conf, struct config_section *sect) /*----- Substitution and quoting ------------------------------------------*/ +/* The substitution and word-splitting state. + * + * This only keeps track of the immutable parameters for the substitution + * task: stuff which changes (flags, filtering state, cursor position) is + * maintained separately. + */ struct subst { - struct config *config; - struct config_section *home, *fallback; - struct argv *av; - struct dstr *d; + struct config *config; /* configuration state */ + struct config_section *home; /* home section for lookups */ + struct dstr *d; /* current word being constructed */ + struct argv *av; /* output word list */ }; -static const char *scan_name(const char *p, const char *l) -{ - while (p < l && - (ISALNUM(*p) || *p == '-' || *p == '_' || *p == '.' || *p == '/' || - *p == '*' || *p == '+' || *p == '%' || *p == '@')) - p++; - return (p); -} - -static void filter_string(const char *p, const char *l, struct subst *sb, - unsigned qfilt) +/* Flags for `subst' and related functions. */ +#define SF_SPLIT 0x0001u /* split at (unquoted) whitespace */ +#define SF_QUOT 0x0002u /* currently within double quotes */ +#define SF_SUBST 0x0004u /* apply `$-substitutions */ +#define SF_SUBEXPR 0x0008u /* stop at delimiter `|' or `}' */ +#define SF_SPANMASK 0x00ffu /* mask for the above */ + +#define SF_WORD 0x0100u /* output word under construction */ +#define SF_SKIP 0x0200u /* not producing output */ +#define SF_LITERAL 0x0400u /* do not expand or substitute */ +#define SF_UPCASE 0x0800u /* convert to uppercase */ +#define SF_DOWNCASE 0x1000u /* convert to lowercase */ +#define SF_CASEMASK 0x1800u /* mask for case conversions */ + +/* Apply filters encoded in QFILT and F to the text from P to L, and output. + * + * SB is the substitution state which, in particular, explains where the + * output should go. + * + * The filters are encoded as flags `SF_UPCASE' and `SF_DOWNCASE' for case + * conversions, and a nesting depth QFILT for toothpick escaping. (QFILT is + * encoded as the number of toothpicks to print: see `subst' for how this + * determined.) + */ +static void filter_string(const char *p, const char *l, + const struct subst *sb, unsigned qfilt, unsigned f) { size_t r, n; + char *q; const char *pp, *ll; - if (!qfilt) + if (!qfilt && !(f&SF_CASEMASK)) + /* Fast path: there's nothing to do: just write to the output. */ dstr_putm(sb->d, p, l - p); + else for (;;) { - r = l - p; n = strcspn(p, "\"\\"); + /* We must be a bit more circumspect. */ + + /* Determine the length of the next span of characters which don't need + * escaping. (If QFILT is zero then this is everything.) + */ + r = l - p; n = qfilt ? strcspn(p, "\"\\") : r; if (n > r) n = r; - dstr_putm(sb->d, p, n); + + if (!(f&SF_CASEMASK)) + /* No case conversion: we can just emit this chunk. */ + + dstr_putm(sb->d, p, n); + + else { + /* Case conversion to do. Arrange enough space for the output, and + * convert it character by character. + */ + + dstr_ensure(sb->d, n); q = sb->d->p + sb->d->len; pp = p; ll = p + n; + if (f&SF_DOWNCASE) while (pp < ll) *q++ = TOLOWER(*pp++); + else if (f&SF_UPCASE) while (pp < ll) *q++ = TOUPPER(*pp++); + sb->d->len += n; + } + + /* If we've reached the end then stop. */ if (n >= r) break; - dstr_putcn(sb->d, '\\', qfilt); dstr_putc(sb->d, p[n]); - p += n + 1; + + /* Otherwise we must have found a character which requires escaping. + * Emit enough toothpicks. + */ + dstr_putcn(sb->d, '\\', qfilt); + + /* This character is now done, so we can skip over and see if there's + * another chunk of stuff we can do at high speed. + */ + dstr_putc(sb->d, p[n]); p += n + 1; } } +/* Scan and resolve a `[SECT:]VAR' specifier at P. + * + * Return the address of the next character following the specifier; and set + * *VAR_OUT to point to the variable we found, or null if it's not there. L + * is a limit on the region of the buffer that we should process; SB is the + * substitution state which provides the home section if none is given + * explicitly; FILE and LINE are the source location to blame for problems. + */ static const char *retrieve_varspec(const char *p, const char *l, - struct subst *sb, - struct config_var **var_out) + const struct subst *sb, + struct config_var **var_out, + const char *file, unsigned line) { struct config_section *sect = sb->home; const char *t; - t = scan_name(p, l); + t = scan_name("section or variable", p, l, file, line); if (t < l && *t == ':') { sect = config_find_section_n(sb->config, 0, p, t - p); - p = t + 1; t = scan_name(p, l); + p = t + 1; t = scan_name("variable", p, l, file, line); } if (!sect) *var_out = 0; @@ -740,16 +1361,16 @@ static const char *retrieve_varspec(const char *p, const char *l, return (t); } -#define SF_SPLIT 0x0001u -#define SF_QUOT 0x0002u -#define SF_SUBST 0x0004u -#define SF_SUBEXPR 0x0008u -#define SF_SPANMASK 0x00ffu -#define SF_WORD 0x0100u -#define SF_SKIP 0x0200u -#define SF_LITERAL 0x0400u - -static const char *subst(const char *p, const char *l, struct subst *sb, +/* Substitute and/or word-split text. + * + * The input text starts at P, and continues to (just before) L. Context for + * the task is provided by SB; the source location to blame is FILE and LINE + * (FILE may be null so that this can be passed directly from a `config_var' + * without further checking); QFILT is the nesting depth in toothpick- + * escaping; and F holds a mask of `SF_...' flags. + */ +static const char *subst(const char *p, const char *l, + const struct subst *sb, const char *file, unsigned line, unsigned qfilt, unsigned f) { @@ -758,46 +1379,67 @@ static const char *subst(const char *p, const char *l, struct subst *sb, unsigned subqfilt, ff; size_t n; -#define ESCAPE "\\" -#define SUBST "$" -#define WORDSEP " \f\r\n\t\v'\"" -#define QUOT "\"" -#define DELIM "|}" - - static const char *const delimtab[] = - { ESCAPE, - ESCAPE WORDSEP, - 0, - ESCAPE QUOT, - ESCAPE SUBST, - ESCAPE SUBST WORDSEP, - 0, - ESCAPE SUBST QUOT, - ESCAPE DELIM, - ESCAPE DELIM WORDSEP, - 0, - ESCAPE DELIM QUOT, - ESCAPE DELIM SUBST, - ESCAPE DELIM SUBST WORDSEP, - 0, - ESCAPE DELIM SUBST QUOT }; + /* It would be best if we could process literal text at high speed. To + * this end, we have a table, indexed by the low-order bits of F, to tell + * us which special characters we need to stop at. This way, we can use + * `strcspn' to skip over literal text and stop at the next character which + * needs special handling. Entries in this table with a null pointer + * correspond to impossible flag settings. + */ + static const char *const delimtab[] = { + +#define ESCAPE "\\" /* always watch for `\'-escapes */ +#define SUBST "$" /* check for `$' if `SF_SUBST' set */ +#define WORDSEP " \f\r\n\t\v'\"" /* space, quotes if `SF_SPLIT' but + * not `SF_QUOT' */ +#define QUOT "\"" /* only quotes if `SF_SPLIT' and + * `SF_QUOT' */ +#define DELIM "|}" /* end delimiters of `SF_SUBEXPR' */ + + ESCAPE, + ESCAPE WORDSEP, + 0, + ESCAPE QUOT, + ESCAPE SUBST, + ESCAPE SUBST WORDSEP, + 0, + ESCAPE SUBST QUOT, + ESCAPE DELIM, + ESCAPE DELIM WORDSEP, + 0, + ESCAPE DELIM QUOT, + ESCAPE DELIM SUBST, + ESCAPE DELIM SUBST WORDSEP, + 0, + ESCAPE DELIM SUBST QUOT #undef COMMON #undef WORDSEP #undef SQUOT #undef DELIM + }; + /* Set FILE to be useful if it was null on entry. */ if (!file) file = ""; + /* If the text is literal then hand off to `filter_string'. This obviously + * starts a word. + */ if (f&SF_LITERAL) { - filter_string(p, l, sb, qfilt); + filter_string(p, l, sb, qfilt, f); f |= SF_WORD; goto done; } + /* Chew through the input until it's all gone. */ while (p < l) { if ((f&(SF_SPLIT | SF_QUOT)) == SF_SPLIT && ISSPACE(*p)) { + /* This is whitespace, we're supposed to split, and we're not within + * quotes, so we should split here. + */ + + /* If there's a word in progress then we should commit it. */ if (f&SF_WORD) { if (!(f&SF_SKIP)) { argv_append(sb->av, xstrndup(sb->d->p, sb->d->len)); @@ -805,60 +1447,182 @@ static const char *subst(const char *p, const char *l, struct subst *sb, } f &= ~SF_WORD; } + + /* Skip over further whitespace at high speed. */ do p++; while (p < l && ISSPACE(*p)); } else if (*p == '\\') { - p++; - if (p >= l) lose("%s:%u: unfinished `\\' escape", file, line); + /* This is a toothpick, so start a new word and add the next character + * to it. + */ + + /* If there's no next character then we should be upset. */ + p++; if (p >= l) lose("%s:%u: unfinished `\\' escape", file, line); + if (!(f&SF_SKIP)) { + + /* If this is a double quote or backslash then check DFLT to see if + * it needs escaping. + */ if (qfilt && (*p == '"' || *p == '\\')) dstr_putcn(sb->d, '\\', qfilt); - dstr_putc(sb->d, *p); + + /* Output the character. */ + if (f&SF_DOWNCASE) dstr_putc(sb->d, TOLOWER(*p)); + else if (f&SF_UPCASE) dstr_putc(sb->d, TOUPPER(*p)); + else dstr_putc(sb->d, *p); } - p++; + + /* Move past the escaped character. Remember we started a word. */ + p++; f |= SF_WORD; } else if ((f&SF_SPLIT) && *p == '"') { + /* This is a double quote, and we're word splitting. We're definitely + * in a word now. Toggle whether we're within quotes. + */ + f ^= SF_QUOT; f |= SF_WORD; p++; } else if ((f&(SF_SPLIT | SF_QUOT)) == SF_SPLIT && *p == '\'') { - t = strchr(p, '\''); if (!t) lose("%s:%u: missing `''", file, line); - if (!(f&SF_SKIP)) filter_string(p, t, sb, qfilt); + /* This is a single quote, and we're word splitting but not within + * double quotes. Find the matching end quote, and just output + * everything between literally. + */ + + p++; t = strchr(p, '\''); + if (!t || t >= l) lose("%s:%u: missing `''", file, line); + if (!(f&SF_SKIP)) filter_string(p, t, sb, qfilt, f); p = t + 1; f |= SF_WORD; } else if ((f&SF_SUBEXPR) && (*p == '|' || *p == '}')) { + /* This is an end delimiter, and we're supposed to stop here. */ break; } else if ((f&SF_SUBST) && *p == '$') { + /* This is a `$' and we're supposed to do substitution. */ + + /* The kind of substitution is determined by the next character. */ p++; if (p >= l) lose("%s:%u: incomplete substitution", file, line); + + /* Prepare flags for a recursive substitution. + * + * Hide our quote state from the recursive call. If we're within a + * word, then disable word-splitting. + */ ff = f&~(SF_QUOT | (f&SF_WORD ? SF_SPLIT : 0)); + + /* Now dispatch based on the following character. */ switch (*p) { case '?': - p = retrieve_varspec(p + 1, l, sb, &var); + /* A conditional expression: $?VAR{CONSEQ[|ALT]} */ + + /* Skip initial space. */ + p++; while (p < l && ISSPACE(*p)) p++; + + /* Find the variable. */ + p = retrieve_varspec(p, l, sb, &var, file, line); + + /* Skip whitespace again. */ + while (p < l && ISSPACE(*p)) p++; + + /* Expect the opening `{'. */ if (p > l || *p != '{') lose("%s:%u: expected `{'", file, line); p++; + + /* We'll process the parts recursively, but we need to come back + * when we hit the appropriate delimiters, so arrange for that. + */ ff |= SF_SUBEXPR; + + /* Process the consequent (skip if the variable wasn't found). */ p = subst(p, l, sb, file, line, qfilt, ff | (var ? 0 : SF_SKIP)); + + /* If there's a `|' then process the alternative too (skip if the + * variable /was/ found). + */ if (p < l && *p == '|') p = subst(p + 1, l, sb, file, line, qfilt, ff | (var ? SF_SKIP : 0)); + + /* We should now be past the closing `}'. */ if (p >= l || *p != '}') lose("%s:%u: missing `}'", file, line); p++; break; case '{': - q0 = p + 1; p = retrieve_varspec(q0, l, sb, &var); q1 = p; + /* A variable substitution: ${VAR[|FILT]...[?ALT]} */ + + /* Skip initial whitespace. */ + p++; while (p < l && ISSPACE(*p)) p++; + + /* Find the variable. */ + q0 = p; p = retrieve_varspec(p, l, sb, &var, file, line); q1 = p; + + /* Determine the filters to apply when substituting the variable + * value. + */ subqfilt = qfilt; - while (p < l) { - if (*p != '|') break; - p++; t = scan_name(p, l); - if (t - p == 1 && *p == 'q') subqfilt = 2*subqfilt + 1; + for (;;) { + + /* Skip spaces again. */ + while (p < l && ISSPACE(*p)) p++; + + /* If there's no `|' then there are no more filters, so stop. */ + if (p >= l || *p != '|') break; + + /* Skip the `|' and more spaces. */ + p++; while (p < l && ISSPACE(*p)) p++; + + /* Collect the filter name. */ + t = scan_name("filter", p, l, file, line); + + /* Dispatch on the filter name. */ + if (t - p == 1 && *p == 'q') + /* `q' -- quote for Lisp string. + * + * We're currently adding Q `\' characters before each naughty + * character. But a backslash itself is naughty too, so that + * makes Q + 1 naughty characters, each of which needs a + * toothpick, so now we need Q + (Q + 1) = 2 Q + 1 toothpicks. + * + * Calculate this here rather than at each point toothpicks + * need to be deployed. + */ + + subqfilt = 2*subqfilt + 1; + + else if (t - p == 1 && *p == 'l') + /* `l' -- convert to lowercase. + * + * If a case conversion is already set, then that will override + * whatever we do here, so don't bother. + */ + + { if (!(ff&SF_CASEMASK)) ff |= SF_DOWNCASE; } + + else if (t - p == 1 && *p == 'u') + /* `u' -- convert to uppercase. + * + * If a case conversion is already set, then that will override + * whatever we do here, so don't bother. + */ + { if (!(ff&SF_CASEMASK)) ff |= SF_UPCASE; } + else + /* Something else we didn't understand. */ lose("%s:%u: unknown filter `%.*s'", file, line, (int)(t - p), p); + + /* Continue from after the filter name. */ p = t; } + + /* If we're not skipping, and we found a variable, then substitute + * its value. This is the point where we need to be careful about + * recursive expansion. + */ if (!(f&SF_SKIP) && var) { if (var->f&CF_EXPAND) lose("%s:%u: recursive expansion of variable `%.*s'", @@ -869,19 +1633,35 @@ static const char *subst(const char *p, const char *l, struct subst *sb, ff | (var->f&CF_LITERAL ? SF_LITERAL : 0)); var->f &= ~CF_EXPAND; } + + /* If there's an alternative, then we need to process (or maybe + * skip) it. Otherwise, we should complain if there was no + * veriable, and we're not skipping. + */ if (p < l && *p == '?') p = subst(p + 1, l, sb, file, line, subqfilt, ff | SF_SUBEXPR | (var ? SF_SKIP : 0)); else if (!var && !(f&SF_SKIP)) lose("%s:%u: unknown variable `%.*s'", file, line, (int)(q1 - q0), q0); + + /* Expect a `}' here. (No need to skip spaces: we already did that + * after scanning for filters, and either there was no alternative, + * or we advanced to a delimiter character anyway.) + */ if (p >= l || *p != '}') lose("%s:%u: missing `}'", file, line); p++; break; default: - lose("%s:%u: unexpected substitution `%c'", file, line, *p); + /* Something else. That's a shame. */ + lose("%s:%u: unexpected `$'-substitution `%c'", file, line, *p); } + + /* Complain if we started out in word-splitting state, and therefore + * have added a whole number of words to the output, but there's a + * word-fragment stuck onto the end of this substitution. + */ if (p < l && !(~f&~(SF_WORD | SF_SPLIT)) && !ISSPACE(*p) && !((f&SF_SUBEXPR) && (*p == '|' || *p == '}'))) lose("%s:%u: surprising word boundary " @@ -890,23 +1670,42 @@ static const char *subst(const char *p, const char *l, struct subst *sb, } else { + /* Something else. Try to skip over this at high speed. + * + * This makes use of the table we set up earlier. + */ + n = strcspn(p, delimtab[f&SF_SPANMASK]); if (n > l - p) n = l - p; - if (!(f&SF_SKIP)) filter_string(p, p + n, sb, qfilt); + if (!(f&SF_SKIP)) filter_string(p, p + n, sb, qfilt, f); p += n; f |= SF_WORD; } } done: + /* Sort out the wreckage. */ + + /* If we're still within quotes then something has gone wrong. */ if (f&SF_QUOT) lose("%s:%u: missing `\"'", file, line); + + /* If we're within a word, and should be splitting, then commit the word to + * the output list. + */ if ((f&(SF_WORD | SF_SPLIT | SF_SKIP)) == (SF_SPLIT | SF_WORD)) { argv_append(sb->av, xstrndup(sb->d->p, sb->d->len)); dstr_reset(sb->d); } + /* And, with that, we're done. */ return (p); } +/* Expand substitutions in a string. + * + * Expand the null-terminated string P relative to the HOME section, using + * configuration CONFIG, and appending the result to dynamic string D. Blame + * WHAT in any error messages. + */ void config_subst_string(struct config *config, struct config_section *home, const char *what, const char *p, struct dstr *d) { @@ -917,6 +1716,12 @@ void config_subst_string(struct config *config, struct config_section *home, dstr_putz(d); } +/* Expand substitutions in a string. + * + * Expand the null-terminated string P relative to the HOME section, using + * configuration CONFIG, returning the result as a freshly malloc(3)ed + * string. Blame WHAT in any error messages. + */ char *config_subst_string_alloc(struct config *config, struct config_section *home, const char *what, const char *p) @@ -928,6 +1733,11 @@ char *config_subst_string_alloc(struct config *config, q = xstrndup(d.p, d.len); dstr_release(&d); return (q); } +/* Expand substitutions in a variable. + * + * Expand the value of the variable VAR relative to the HOME section, using + * configuration CONFIG, appending the result to dynamic string D. + */ void config_subst_var(struct config *config, struct config_section *home, struct config_var *var, struct dstr *d) { @@ -941,6 +1751,12 @@ void config_subst_var(struct config *config, struct config_section *home, dstr_putz(d); } +/* Expand substitutions in a variable. + * + * Expand the value of the variable VAR relative to the HOME section, using + * configuration CONFIG, returning the result as a freshly malloc(3)ed + * string. + */ char *config_subst_var_alloc(struct config *config, struct config_section *home, struct config_var *var) @@ -952,6 +1768,12 @@ char *config_subst_var_alloc(struct config *config, q = xstrndup(d.p, d.len); dstr_release(&d); return (q); } +/* Expand substitutions in a variable and split into words. + * + * Expand and word-split the value of the variable VAR relative to the HOME + * section, using configuration CONFIG, appending the resulting words into + * the vector AV. + */ void config_subst_split_var(struct config *config, struct config_section *home, struct config_var *var, struct argv *av)