Implemented a simple wildcard matching engine, and used it to

[u/mdw/putty] / wildcard.c
diff --git a/wildcard.c b/wildcard.c

new file mode 100644 (file)

index 0000000..999dfde
--- /dev/null
+++ b/wildcard.c
@@ -0,0 +1,459 @@
+/*
+ * Wildcard matching engine for use with SFTP-based file transfer
+ * programs (PSFTP, new-look PSCP): since SFTP has no notion of
+ * getting the remote side to do globbing (and rightly so) we have
+ * to do it locally, by retrieving all the filenames in a directory
+ * and checking each against the wildcard pattern.
+ */
+
+#include <assert.h>
+#include <stdlib.h>
+#include <string.h>
+
+/*
+ * Definition of wildcard syntax:
+ * 
+ *  - * matches any sequence of characters, including zero.
+ *  - ? matches exactly one character which can be anything.
+ *  - [abc] matches exactly one character which is a, b or c.
+ *  - [a-f] matches anything from a through f.
+ *  - [^a-f] matches anything _except_ a through f.
+ *  - [-_] matches - or _; [^-_] matches anything else. (The - is
+ *    non-special if it occurs immediately after the opening
+ *    bracket or ^.)
+ *  - [a^] matches an a or a ^. (The ^ is non-special if it does
+ *    _not_ occur immediately after the opening bracket.)
+ *  - \*, \?, \[, \], \\ match the single characters *, ?, [, ], \.
+ *  - All other characters are non-special and match themselves.
+ */
+
+/*
+ * The wildcard matching technique we use is very simple and
+ * potentially O(N^2) in running time, but I don't anticipate it
+ * being that bad in reality (particularly since N will be the size
+ * of a filename, which isn't all that much). Perhaps one day, once
+ * PuTTY has grown a regexp matcher for some other reason, I might
+ * come back and reimplement wildcards by translating them into
+ * regexps or directly into NFAs; but for the moment, in the
+ * absence of any other need for the NFA->DFA translation engine,
+ * anything more than the simplest possible wildcard matcher is
+ * vast code-size overkill.
+ * 
+ * Essentially, these wildcards are much simpler than regexps in
+ * that they consist of a sequence of rigid fragments (? and [...]
+ * can never match more or less than one character) separated by
+ * asterisks. It is therefore extremely simple to look at a rigid
+ * fragment and determine whether or not it begins at a particular
+ * point in the test string; so we can search along the string
+ * until we find each fragment, then search for the next. As long
+ * as we find each fragment in the _first_ place it occurs, there
+ * will never be a danger of having to backpedal and try to find it
+ * again somewhere else.
+ */
+
+enum {
+    WC_TRAILINGBACKSLASH = 1,
+    WC_UNCLOSEDCLASS,
+    WC_INVALIDRANGE,
+};
+
+/*
+ * Error reporting is done by returning various negative values
+ * from the wildcard routines. Passing any such value to wc_error
+ * will give a human-readable message.
+ */
+const char *wc_error(int value)
+{
+    value = abs(value);
+    switch (value) {
+      case WC_TRAILINGBACKSLASH:
+       return "'\' occurred at end of string (expected another character)";
+      case WC_UNCLOSEDCLASS:
+       return "expected ']' to close character class";
+      case WC_INVALIDRANGE:
+       return "character range was not terminated (']' just after '-')";
+    }
+    return "INTERNAL ERROR: unrecognised wildcard error number";
+}
+
+/*
+ * This is the routine that tests a target string to see if an
+ * initial substring of it matches a fragment. If successful, it
+ * returns 1, and advances both `fragment' and `target' past the
+ * fragment and matching substring respectively. If unsuccessful it
+ * returns zero. If the wildcard fragment suffers a syntax error,
+ * it returns <0 and the precise value indexes into wc_error.
+ */
+static int wc_match_fragment(const char **fragment, const char **target)
+{
+    const char *f, *t;
+
+    f = *fragment;
+    t = *target;
+    /*
+     * The fragment terminates at either the end of the string, or
+     * the first (unescaped) *.
+     */
+    while (*f && *f != '*' && *t) {
+       /*
+        * Extract one character from t, and one character's worth
+        * of pattern from f, and step along both. Return 0 if they
+        * fail to match.
+        */
+       if (*f == '\\') {
+           /*
+            * Backslash, which means f[1] is to be treated as a
+            * literal character no matter what it is. It may not
+            * be the end of the string.
+            */
+           if (!f[1])
+               return -WC_TRAILINGBACKSLASH;   /* error */
+           if (f[1] != *t)
+               return 0;              /* failed to match */
+           f += 2;
+       } else if (*f == '?') {
+           /*
+            * Question mark matches anything.
+            */
+           f++;
+       } else if (*f == '[') {
+           int invert = 0;
+           int matched = 0;
+           /*
+            * Open bracket introduces a character class.
+            */
+           f++;
+           if (*f == '^') {
+               invert = 1;
+               f++;
+           }
+           while (*f != ']') {
+               if (*f == '\\')
+                   f++;               /* backslashes still work */
+               if (!*f)
+                   return -WC_UNCLOSEDCLASS;   /* error again */
+               if (f[1] == '-') {
+                   int lower, upper, ourchr;
+                   lower = (unsigned char) *f++;
+                   f++;               /* eat the minus */
+                   if (*f == ']')
+                       return -WC_INVALIDRANGE;   /* different error! */
+                   if (*f == '\\')
+                       f++;           /* backslashes _still_ work */
+                   if (!*f)
+                       return -WC_UNCLOSEDCLASS;   /* error again */
+                   upper = (unsigned char) *f++;
+                   ourchr = (unsigned char) *t;
+                   if (lower > upper) {
+                       int t = lower; lower = upper; upper = t;
+                   }
+                   if (ourchr >= lower && ourchr <= upper)
+                       matched = 1;
+               } else {
+                   matched |= (*t == *f++);
+               }
+           }
+           if (invert == matched)
+               return 0;              /* failed to match character class */
+           f++;                       /* eat the ] */
+       } else {
+           /*
+            * Non-special character matches itself.
+            */
+           if (*f != *t)
+               return 0;
+           f++;
+       }
+       /*
+        * Now we've done that, increment t past the character we
+        * matched.
+        */
+       t++;
+    }
+    if (!*f || *f == '*') {
+       /*
+        * We have reached the end of f without finding a mismatch;
+        * so we're done. Update the caller pointers and return 1.
+        */
+       *fragment = f;
+       *target = t;
+       return 1;
+    }
+    /*
+     * Otherwise, we must have reached the end of t before we
+     * reached the end of f; so we've failed. Return 0. 
+     */
+    return 0;
+}
+
+/*
+ * This is the real wildcard matching routine. It returns 1 for a
+ * successful match, 0 for an unsuccessful match, and <0 for a
+ * syntax error in the wildcard.
+ */
+int wc_match(const char *wildcard, const char *target)
+{
+    int ret;
+
+    /*
+     * Every time we see a '*' _followed_ by a fragment, we just
+     * search along the string for a location at which the fragment
+     * matches. The only special case is when we see a fragment
+     * right at the start, in which case we just call the matching
+     * routine once and give up if it fails.
+     */
+    if (*wildcard != '*') {
+       ret = wc_match_fragment(&wildcard, &target);
+       if (ret <= 0)
+           return ret;                /* pass back failure or error alike */
+    }
+
+    while (*wildcard) {
+       assert(*wildcard == '*');
+       while (*wildcard == '*')
+           wildcard++;
+
+       /*
+        * It's possible we've just hit the end of the wildcard
+        * after seeing a *, in which case there's no need to
+        * bother searching any more because we've won.
+        */
+       if (!*wildcard)
+           return 1;
+
+       /*
+        * Now `wildcard' points at the next fragment. So we
+        * attempt to match it against `target', and if that fails
+        * we increment `target' and try again, and so on. When we
+        * find we're about to try matching against the empty
+        * string, we give up and return 0.
+        */
+       ret = 0;
+       while (*target) {
+           const char *save_w = wildcard, *save_t = target;
+
+           ret = wc_match_fragment(&wildcard, &target);
+
+           if (ret < 0)
+               return ret;            /* syntax error */
+
+           if (ret > 0 && !*wildcard && *target) {
+               /*
+                * Final special case - literally.
+                * 
+                * This situation arises when we are matching a
+                * _terminal_ fragment of the wildcard (that is,
+                * there is nothing after it, e.g. "*a"), and it
+                * has matched _too early_. For example, matching
+                * "*a" against "parka" will match the "a" fragment
+                * against the _first_ a, and then (if it weren't
+                * for this special case) matching would fail
+                * because we're at the end of the wildcard but not
+                * at the end of the target string.
+                * 
+                * In this case what we must do is measure the
+                * length of the fragment in the target (which is
+                * why we saved `target'), jump straight to that
+                * distance from the end of the string using
+                * strlen, and match the same fragment again there
+                * (which is why we saved `wildcard'). Then we
+                * return whatever that operation returns.
+                */
+               target = save_t + strlen(save_t) - (target - save_t);
+               wildcard = save_w;
+               return wc_match_fragment(&wildcard, &target);
+           }
+
+           if (ret > 0)
+               break;
+           target++;
+       }
+       if (ret > 0)
+           continue;
+       return 0;
+    }
+
+    /*
+     * If we reach here, it must be because we successfully matched
+     * a fragment and then found ourselves right at the end of the
+     * wildcard. Hence, we return 1 if and only if we are also
+     * right at the end of the target.
+     */
+    return (*target ? 0 : 1);
+}
+
+/*
+ * Another utility routine that translates a non-wildcard string
+ * into its raw equivalent by removing any escaping backslashes.
+ * Expects a target string buffer of anything up to the length of
+ * the original wildcard. You can also pass NULL as the output
+ * buffer if you're only interested in the return value.
+ * 
+ * Returns 1 on success, or 0 if a wildcard character was
+ * encountered. In the latter case the output string MAY not be
+ * zero-terminated and you should not use it for anything!
+ */
+int wc_unescape(char *output, const char *wildcard)
+{
+    while (*wildcard) {
+       if (*wildcard == '\\') {
+           wildcard++;
+           /* We are lenient about trailing backslashes in non-wildcards. */
+           if (*wildcard) {
+               if (output)
+                   *output++ = *wildcard;
+               wildcard++;
+           }
+       } else if (*wildcard == '*' || *wildcard == '?' ||
+                  *wildcard == '[' || *wildcard == ']') {
+           return 0;                  /* it's a wildcard! */
+       } else {
+           if (output)
+               *output++ = *wildcard;
+           wildcard++;
+       }
+    }
+    *output = '\0';
+    return 1;                         /* it's clean */
+}
+
+#ifdef TESTMODE
+
+struct test {
+    const char *wildcard;
+    const char *target;
+    int expected_result;
+};
+
+const struct test fragment_tests[] = {
+    /*
+     * We exhaustively unit-test the fragment matching routine
+     * itself, which should save us the need to test all its
+     * intricacies during the full wildcard tests.
+     */
+    {"abc", "abc", 1},
+    {"abc", "abd", 0},
+    {"abc", "abcd", 1},
+    {"abcd", "abc", 0},
+    {"ab[cd]", "abc", 1},
+    {"ab[cd]", "abd", 1},
+    {"ab[cd]", "abe", 0},
+    {"ab[^cd]", "abc", 0},
+    {"ab[^cd]", "abd", 0},
+    {"ab[^cd]", "abe", 1},
+    {"ab\\", "abc", -WC_TRAILINGBACKSLASH},
+    {"ab\\*", "ab*", 1},
+    {"ab\\?", "ab*", 0},
+    {"ab?", "abc", 1},
+    {"ab?", "ab", 0},
+    {"ab[", "abc", -WC_UNCLOSEDCLASS},
+    {"ab[c-", "abb", -WC_UNCLOSEDCLASS},
+    {"ab[c-]", "abb", -WC_INVALIDRANGE},
+    {"ab[c-e]", "abb", 0},
+    {"ab[c-e]", "abc", 1},
+    {"ab[c-e]", "abd", 1},
+    {"ab[c-e]", "abe", 1},
+    {"ab[c-e]", "abf", 0},
+    {"ab[e-c]", "abb", 0},
+    {"ab[e-c]", "abc", 1},
+    {"ab[e-c]", "abd", 1},
+    {"ab[e-c]", "abe", 1},
+    {"ab[e-c]", "abf", 0},
+    {"ab[^c-e]", "abb", 1},
+    {"ab[^c-e]", "abc", 0},
+    {"ab[^c-e]", "abd", 0},
+    {"ab[^c-e]", "abe", 0},
+    {"ab[^c-e]", "abf", 1},
+    {"ab[^e-c]", "abb", 1},
+    {"ab[^e-c]", "abc", 0},
+    {"ab[^e-c]", "abd", 0},
+    {"ab[^e-c]", "abe", 0},
+    {"ab[^e-c]", "abf", 1},
+    {"ab[a^]", "aba", 1},
+    {"ab[a^]", "ab^", 1},
+    {"ab[a^]", "abb", 0},
+    {"ab[^a^]", "aba", 0},
+    {"ab[^a^]", "ab^", 0},
+    {"ab[^a^]", "abb", 1},
+    {"ab[-c]", "ab-", 1},
+    {"ab[-c]", "abc", 1},
+    {"ab[-c]", "abd", 0},
+    {"ab[^-c]", "ab-", 0},
+    {"ab[^-c]", "abc", 0},
+    {"ab[^-c]", "abd", 1},
+    {"ab[\\[-\\]]", "abZ", 0},
+    {"ab[\\[-\\]]", "ab[", 1},
+    {"ab[\\[-\\]]", "ab\\", 1},
+    {"ab[\\[-\\]]", "ab]", 1},
+    {"ab[\\[-\\]]", "ab^", 0},
+    {"ab[^\\[-\\]]", "abZ", 1},
+    {"ab[^\\[-\\]]", "ab[", 0},
+    {"ab[^\\[-\\]]", "ab\\", 0},
+    {"ab[^\\[-\\]]", "ab]", 0},
+    {"ab[^\\[-\\]]", "ab^", 1},
+    {"ab[a-fA-F]", "aba", 1},
+    {"ab[a-fA-F]", "abF", 1},
+    {"ab[a-fA-F]", "abZ", 0},
+};
+
+const struct test full_tests[] = {
+    {"a", "argh", 0},
+    {"a", "ba", 0},
+    {"a", "a", 1},
+    {"a*", "aardvark", 1},
+    {"a*", "badger", 0},
+    {"*a", "park", 0},
+    {"*a", "pArka", 1},
+    {"*a", "parka", 1},
+    {"*a*", "park", 1},
+    {"*a*", "perk", 0},
+    {"?b*r?", "abracadabra", 1},
+    {"?b*r?", "abracadabr", 0},
+    {"?b*r?", "abracadabzr", 0},
+};
+
+int main(void)
+{
+    int i;
+    int fails, passes;
+
+    fails = passes = 0;
+
+    for (i = 0; i < sizeof(fragment_tests)/sizeof(*fragment_tests); i++) {
+       const char *f, *t;
+       int eret, aret;
+       f = fragment_tests[i].wildcard;
+       t = fragment_tests[i].target;
+       eret = fragment_tests[i].expected_result;
+       aret = wc_match_fragment(&f, &t);
+       if (aret != eret) {
+           printf("failed test: /%s/ against /%s/ returned %d not %d\n",
+                  fragment_tests[i].wildcard, fragment_tests[i].target,
+                  aret, eret);
+           fails++;
+       } else
+           passes++;
+    }
+
+    for (i = 0; i < sizeof(full_tests)/sizeof(*full_tests); i++) {
+       const char *f, *t;
+       int eret, aret;
+       f = full_tests[i].wildcard;
+       t = full_tests[i].target;
+       eret = full_tests[i].expected_result;
+       aret = wc_match(f, t);
+       if (aret != eret) {
+           printf("failed test: /%s/ against /%s/ returned %d not %d\n",
+                  full_tests[i].wildcard, full_tests[i].target,
+                  aret, eret);
+           fails++;
+       } else
+           passes++;
+    }
+
+    printf("passed %d, failed %d\n", passes, fails);
+
+    return 0;
+}
+
+#endif