| 1 | /* |
| 2 | * Wildcard matching engine for use with SFTP-based file transfer |
| 3 | * programs (PSFTP, new-look PSCP): since SFTP has no notion of |
| 4 | * getting the remote side to do globbing (and rightly so) we have |
| 5 | * to do it locally, by retrieving all the filenames in a directory |
| 6 | * and checking each against the wildcard pattern. |
| 7 | */ |
| 8 | |
| 9 | #include <assert.h> |
| 10 | #include <stdlib.h> |
| 11 | #include <string.h> |
| 12 | |
| 13 | /* |
| 14 | * Definition of wildcard syntax: |
| 15 | * |
| 16 | * - * matches any sequence of characters, including zero. |
| 17 | * - ? matches exactly one character which can be anything. |
| 18 | * - [abc] matches exactly one character which is a, b or c. |
| 19 | * - [a-f] matches anything from a through f. |
| 20 | * - [^a-f] matches anything _except_ a through f. |
| 21 | * - [-_] matches - or _; [^-_] matches anything else. (The - is |
| 22 | * non-special if it occurs immediately after the opening |
| 23 | * bracket or ^.) |
| 24 | * - [a^] matches an a or a ^. (The ^ is non-special if it does |
| 25 | * _not_ occur immediately after the opening bracket.) |
| 26 | * - \*, \?, \[, \], \\ match the single characters *, ?, [, ], \. |
| 27 | * - All other characters are non-special and match themselves. |
| 28 | */ |
| 29 | |
| 30 | /* |
| 31 | * The wildcard matching technique we use is very simple and |
| 32 | * potentially O(N^2) in running time, but I don't anticipate it |
| 33 | * being that bad in reality (particularly since N will be the size |
| 34 | * of a filename, which isn't all that much). Perhaps one day, once |
| 35 | * PuTTY has grown a regexp matcher for some other reason, I might |
| 36 | * come back and reimplement wildcards by translating them into |
| 37 | * regexps or directly into NFAs; but for the moment, in the |
| 38 | * absence of any other need for the NFA->DFA translation engine, |
| 39 | * anything more than the simplest possible wildcard matcher is |
| 40 | * vast code-size overkill. |
| 41 | * |
| 42 | * Essentially, these wildcards are much simpler than regexps in |
| 43 | * that they consist of a sequence of rigid fragments (? and [...] |
| 44 | * can never match more or less than one character) separated by |
| 45 | * asterisks. It is therefore extremely simple to look at a rigid |
| 46 | * fragment and determine whether or not it begins at a particular |
| 47 | * point in the test string; so we can search along the string |
| 48 | * until we find each fragment, then search for the next. As long |
| 49 | * as we find each fragment in the _first_ place it occurs, there |
| 50 | * will never be a danger of having to backpedal and try to find it |
| 51 | * again somewhere else. |
| 52 | */ |
| 53 | |
| 54 | enum { |
| 55 | WC_TRAILINGBACKSLASH = 1, |
| 56 | WC_UNCLOSEDCLASS, |
| 57 | WC_INVALIDRANGE, |
| 58 | }; |
| 59 | |
| 60 | /* |
| 61 | * Error reporting is done by returning various negative values |
| 62 | * from the wildcard routines. Passing any such value to wc_error |
| 63 | * will give a human-readable message. |
| 64 | */ |
| 65 | const char *wc_error(int value) |
| 66 | { |
| 67 | value = abs(value); |
| 68 | switch (value) { |
| 69 | case WC_TRAILINGBACKSLASH: |
| 70 | return "'\' occurred at end of string (expected another character)"; |
| 71 | case WC_UNCLOSEDCLASS: |
| 72 | return "expected ']' to close character class"; |
| 73 | case WC_INVALIDRANGE: |
| 74 | return "character range was not terminated (']' just after '-')"; |
| 75 | } |
| 76 | return "INTERNAL ERROR: unrecognised wildcard error number"; |
| 77 | } |
| 78 | |
| 79 | /* |
| 80 | * This is the routine that tests a target string to see if an |
| 81 | * initial substring of it matches a fragment. If successful, it |
| 82 | * returns 1, and advances both `fragment' and `target' past the |
| 83 | * fragment and matching substring respectively. If unsuccessful it |
| 84 | * returns zero. If the wildcard fragment suffers a syntax error, |
| 85 | * it returns <0 and the precise value indexes into wc_error. |
| 86 | */ |
| 87 | static int wc_match_fragment(const char **fragment, const char **target) |
| 88 | { |
| 89 | const char *f, *t; |
| 90 | |
| 91 | f = *fragment; |
| 92 | t = *target; |
| 93 | /* |
| 94 | * The fragment terminates at either the end of the string, or |
| 95 | * the first (unescaped) *. |
| 96 | */ |
| 97 | while (*f && *f != '*' && *t) { |
| 98 | /* |
| 99 | * Extract one character from t, and one character's worth |
| 100 | * of pattern from f, and step along both. Return 0 if they |
| 101 | * fail to match. |
| 102 | */ |
| 103 | if (*f == '\\') { |
| 104 | /* |
| 105 | * Backslash, which means f[1] is to be treated as a |
| 106 | * literal character no matter what it is. It may not |
| 107 | * be the end of the string. |
| 108 | */ |
| 109 | if (!f[1]) |
| 110 | return -WC_TRAILINGBACKSLASH; /* error */ |
| 111 | if (f[1] != *t) |
| 112 | return 0; /* failed to match */ |
| 113 | f += 2; |
| 114 | } else if (*f == '?') { |
| 115 | /* |
| 116 | * Question mark matches anything. |
| 117 | */ |
| 118 | f++; |
| 119 | } else if (*f == '[') { |
| 120 | int invert = 0; |
| 121 | int matched = 0; |
| 122 | /* |
| 123 | * Open bracket introduces a character class. |
| 124 | */ |
| 125 | f++; |
| 126 | if (*f == '^') { |
| 127 | invert = 1; |
| 128 | f++; |
| 129 | } |
| 130 | while (*f != ']') { |
| 131 | if (*f == '\\') |
| 132 | f++; /* backslashes still work */ |
| 133 | if (!*f) |
| 134 | return -WC_UNCLOSEDCLASS; /* error again */ |
| 135 | if (f[1] == '-') { |
| 136 | int lower, upper, ourchr; |
| 137 | lower = (unsigned char) *f++; |
| 138 | f++; /* eat the minus */ |
| 139 | if (*f == ']') |
| 140 | return -WC_INVALIDRANGE; /* different error! */ |
| 141 | if (*f == '\\') |
| 142 | f++; /* backslashes _still_ work */ |
| 143 | if (!*f) |
| 144 | return -WC_UNCLOSEDCLASS; /* error again */ |
| 145 | upper = (unsigned char) *f++; |
| 146 | ourchr = (unsigned char) *t; |
| 147 | if (lower > upper) { |
| 148 | int t = lower; lower = upper; upper = t; |
| 149 | } |
| 150 | if (ourchr >= lower && ourchr <= upper) |
| 151 | matched = 1; |
| 152 | } else { |
| 153 | matched |= (*t == *f++); |
| 154 | } |
| 155 | } |
| 156 | if (invert == matched) |
| 157 | return 0; /* failed to match character class */ |
| 158 | f++; /* eat the ] */ |
| 159 | } else { |
| 160 | /* |
| 161 | * Non-special character matches itself. |
| 162 | */ |
| 163 | if (*f != *t) |
| 164 | return 0; |
| 165 | f++; |
| 166 | } |
| 167 | /* |
| 168 | * Now we've done that, increment t past the character we |
| 169 | * matched. |
| 170 | */ |
| 171 | t++; |
| 172 | } |
| 173 | if (!*f || *f == '*') { |
| 174 | /* |
| 175 | * We have reached the end of f without finding a mismatch; |
| 176 | * so we're done. Update the caller pointers and return 1. |
| 177 | */ |
| 178 | *fragment = f; |
| 179 | *target = t; |
| 180 | return 1; |
| 181 | } |
| 182 | /* |
| 183 | * Otherwise, we must have reached the end of t before we |
| 184 | * reached the end of f; so we've failed. Return 0. |
| 185 | */ |
| 186 | return 0; |
| 187 | } |
| 188 | |
| 189 | /* |
| 190 | * This is the real wildcard matching routine. It returns 1 for a |
| 191 | * successful match, 0 for an unsuccessful match, and <0 for a |
| 192 | * syntax error in the wildcard. |
| 193 | */ |
| 194 | int wc_match(const char *wildcard, const char *target) |
| 195 | { |
| 196 | int ret; |
| 197 | |
| 198 | /* |
| 199 | * Every time we see a '*' _followed_ by a fragment, we just |
| 200 | * search along the string for a location at which the fragment |
| 201 | * matches. The only special case is when we see a fragment |
| 202 | * right at the start, in which case we just call the matching |
| 203 | * routine once and give up if it fails. |
| 204 | */ |
| 205 | if (*wildcard != '*') { |
| 206 | ret = wc_match_fragment(&wildcard, &target); |
| 207 | if (ret <= 0) |
| 208 | return ret; /* pass back failure or error alike */ |
| 209 | } |
| 210 | |
| 211 | while (*wildcard) { |
| 212 | assert(*wildcard == '*'); |
| 213 | while (*wildcard == '*') |
| 214 | wildcard++; |
| 215 | |
| 216 | /* |
| 217 | * It's possible we've just hit the end of the wildcard |
| 218 | * after seeing a *, in which case there's no need to |
| 219 | * bother searching any more because we've won. |
| 220 | */ |
| 221 | if (!*wildcard) |
| 222 | return 1; |
| 223 | |
| 224 | /* |
| 225 | * Now `wildcard' points at the next fragment. So we |
| 226 | * attempt to match it against `target', and if that fails |
| 227 | * we increment `target' and try again, and so on. When we |
| 228 | * find we're about to try matching against the empty |
| 229 | * string, we give up and return 0. |
| 230 | */ |
| 231 | ret = 0; |
| 232 | while (*target) { |
| 233 | const char *save_w = wildcard, *save_t = target; |
| 234 | |
| 235 | ret = wc_match_fragment(&wildcard, &target); |
| 236 | |
| 237 | if (ret < 0) |
| 238 | return ret; /* syntax error */ |
| 239 | |
| 240 | if (ret > 0 && !*wildcard && *target) { |
| 241 | /* |
| 242 | * Final special case - literally. |
| 243 | * |
| 244 | * This situation arises when we are matching a |
| 245 | * _terminal_ fragment of the wildcard (that is, |
| 246 | * there is nothing after it, e.g. "*a"), and it |
| 247 | * has matched _too early_. For example, matching |
| 248 | * "*a" against "parka" will match the "a" fragment |
| 249 | * against the _first_ a, and then (if it weren't |
| 250 | * for this special case) matching would fail |
| 251 | * because we're at the end of the wildcard but not |
| 252 | * at the end of the target string. |
| 253 | * |
| 254 | * In this case what we must do is measure the |
| 255 | * length of the fragment in the target (which is |
| 256 | * why we saved `target'), jump straight to that |
| 257 | * distance from the end of the string using |
| 258 | * strlen, and match the same fragment again there |
| 259 | * (which is why we saved `wildcard'). Then we |
| 260 | * return whatever that operation returns. |
| 261 | */ |
| 262 | target = save_t + strlen(save_t) - (target - save_t); |
| 263 | wildcard = save_w; |
| 264 | return wc_match_fragment(&wildcard, &target); |
| 265 | } |
| 266 | |
| 267 | if (ret > 0) |
| 268 | break; |
| 269 | target++; |
| 270 | } |
| 271 | if (ret > 0) |
| 272 | continue; |
| 273 | return 0; |
| 274 | } |
| 275 | |
| 276 | /* |
| 277 | * If we reach here, it must be because we successfully matched |
| 278 | * a fragment and then found ourselves right at the end of the |
| 279 | * wildcard. Hence, we return 1 if and only if we are also |
| 280 | * right at the end of the target. |
| 281 | */ |
| 282 | return (*target ? 0 : 1); |
| 283 | } |
| 284 | |
| 285 | /* |
| 286 | * Another utility routine that translates a non-wildcard string |
| 287 | * into its raw equivalent by removing any escaping backslashes. |
| 288 | * Expects a target string buffer of anything up to the length of |
| 289 | * the original wildcard. You can also pass NULL as the output |
| 290 | * buffer if you're only interested in the return value. |
| 291 | * |
| 292 | * Returns 1 on success, or 0 if a wildcard character was |
| 293 | * encountered. In the latter case the output string MAY not be |
| 294 | * zero-terminated and you should not use it for anything! |
| 295 | */ |
| 296 | int wc_unescape(char *output, const char *wildcard) |
| 297 | { |
| 298 | while (*wildcard) { |
| 299 | if (*wildcard == '\\') { |
| 300 | wildcard++; |
| 301 | /* We are lenient about trailing backslashes in non-wildcards. */ |
| 302 | if (*wildcard) { |
| 303 | if (output) |
| 304 | *output++ = *wildcard; |
| 305 | wildcard++; |
| 306 | } |
| 307 | } else if (*wildcard == '*' || *wildcard == '?' || |
| 308 | *wildcard == '[' || *wildcard == ']') { |
| 309 | return 0; /* it's a wildcard! */ |
| 310 | } else { |
| 311 | if (output) |
| 312 | *output++ = *wildcard; |
| 313 | wildcard++; |
| 314 | } |
| 315 | } |
| 316 | *output = '\0'; |
| 317 | return 1; /* it's clean */ |
| 318 | } |
| 319 | |
| 320 | #ifdef TESTMODE |
| 321 | |
| 322 | struct test { |
| 323 | const char *wildcard; |
| 324 | const char *target; |
| 325 | int expected_result; |
| 326 | }; |
| 327 | |
| 328 | const struct test fragment_tests[] = { |
| 329 | /* |
| 330 | * We exhaustively unit-test the fragment matching routine |
| 331 | * itself, which should save us the need to test all its |
| 332 | * intricacies during the full wildcard tests. |
| 333 | */ |
| 334 | {"abc", "abc", 1}, |
| 335 | {"abc", "abd", 0}, |
| 336 | {"abc", "abcd", 1}, |
| 337 | {"abcd", "abc", 0}, |
| 338 | {"ab[cd]", "abc", 1}, |
| 339 | {"ab[cd]", "abd", 1}, |
| 340 | {"ab[cd]", "abe", 0}, |
| 341 | {"ab[^cd]", "abc", 0}, |
| 342 | {"ab[^cd]", "abd", 0}, |
| 343 | {"ab[^cd]", "abe", 1}, |
| 344 | {"ab\\", "abc", -WC_TRAILINGBACKSLASH}, |
| 345 | {"ab\\*", "ab*", 1}, |
| 346 | {"ab\\?", "ab*", 0}, |
| 347 | {"ab?", "abc", 1}, |
| 348 | {"ab?", "ab", 0}, |
| 349 | {"ab[", "abc", -WC_UNCLOSEDCLASS}, |
| 350 | {"ab[c-", "abb", -WC_UNCLOSEDCLASS}, |
| 351 | {"ab[c-]", "abb", -WC_INVALIDRANGE}, |
| 352 | {"ab[c-e]", "abb", 0}, |
| 353 | {"ab[c-e]", "abc", 1}, |
| 354 | {"ab[c-e]", "abd", 1}, |
| 355 | {"ab[c-e]", "abe", 1}, |
| 356 | {"ab[c-e]", "abf", 0}, |
| 357 | {"ab[e-c]", "abb", 0}, |
| 358 | {"ab[e-c]", "abc", 1}, |
| 359 | {"ab[e-c]", "abd", 1}, |
| 360 | {"ab[e-c]", "abe", 1}, |
| 361 | {"ab[e-c]", "abf", 0}, |
| 362 | {"ab[^c-e]", "abb", 1}, |
| 363 | {"ab[^c-e]", "abc", 0}, |
| 364 | {"ab[^c-e]", "abd", 0}, |
| 365 | {"ab[^c-e]", "abe", 0}, |
| 366 | {"ab[^c-e]", "abf", 1}, |
| 367 | {"ab[^e-c]", "abb", 1}, |
| 368 | {"ab[^e-c]", "abc", 0}, |
| 369 | {"ab[^e-c]", "abd", 0}, |
| 370 | {"ab[^e-c]", "abe", 0}, |
| 371 | {"ab[^e-c]", "abf", 1}, |
| 372 | {"ab[a^]", "aba", 1}, |
| 373 | {"ab[a^]", "ab^", 1}, |
| 374 | {"ab[a^]", "abb", 0}, |
| 375 | {"ab[^a^]", "aba", 0}, |
| 376 | {"ab[^a^]", "ab^", 0}, |
| 377 | {"ab[^a^]", "abb", 1}, |
| 378 | {"ab[-c]", "ab-", 1}, |
| 379 | {"ab[-c]", "abc", 1}, |
| 380 | {"ab[-c]", "abd", 0}, |
| 381 | {"ab[^-c]", "ab-", 0}, |
| 382 | {"ab[^-c]", "abc", 0}, |
| 383 | {"ab[^-c]", "abd", 1}, |
| 384 | {"ab[\\[-\\]]", "abZ", 0}, |
| 385 | {"ab[\\[-\\]]", "ab[", 1}, |
| 386 | {"ab[\\[-\\]]", "ab\\", 1}, |
| 387 | {"ab[\\[-\\]]", "ab]", 1}, |
| 388 | {"ab[\\[-\\]]", "ab^", 0}, |
| 389 | {"ab[^\\[-\\]]", "abZ", 1}, |
| 390 | {"ab[^\\[-\\]]", "ab[", 0}, |
| 391 | {"ab[^\\[-\\]]", "ab\\", 0}, |
| 392 | {"ab[^\\[-\\]]", "ab]", 0}, |
| 393 | {"ab[^\\[-\\]]", "ab^", 1}, |
| 394 | {"ab[a-fA-F]", "aba", 1}, |
| 395 | {"ab[a-fA-F]", "abF", 1}, |
| 396 | {"ab[a-fA-F]", "abZ", 0}, |
| 397 | }; |
| 398 | |
| 399 | const struct test full_tests[] = { |
| 400 | {"a", "argh", 0}, |
| 401 | {"a", "ba", 0}, |
| 402 | {"a", "a", 1}, |
| 403 | {"a*", "aardvark", 1}, |
| 404 | {"a*", "badger", 0}, |
| 405 | {"*a", "park", 0}, |
| 406 | {"*a", "pArka", 1}, |
| 407 | {"*a", "parka", 1}, |
| 408 | {"*a*", "park", 1}, |
| 409 | {"*a*", "perk", 0}, |
| 410 | {"?b*r?", "abracadabra", 1}, |
| 411 | {"?b*r?", "abracadabr", 0}, |
| 412 | {"?b*r?", "abracadabzr", 0}, |
| 413 | }; |
| 414 | |
| 415 | int main(void) |
| 416 | { |
| 417 | int i; |
| 418 | int fails, passes; |
| 419 | |
| 420 | fails = passes = 0; |
| 421 | |
| 422 | for (i = 0; i < sizeof(fragment_tests)/sizeof(*fragment_tests); i++) { |
| 423 | const char *f, *t; |
| 424 | int eret, aret; |
| 425 | f = fragment_tests[i].wildcard; |
| 426 | t = fragment_tests[i].target; |
| 427 | eret = fragment_tests[i].expected_result; |
| 428 | aret = wc_match_fragment(&f, &t); |
| 429 | if (aret != eret) { |
| 430 | printf("failed test: /%s/ against /%s/ returned %d not %d\n", |
| 431 | fragment_tests[i].wildcard, fragment_tests[i].target, |
| 432 | aret, eret); |
| 433 | fails++; |
| 434 | } else |
| 435 | passes++; |
| 436 | } |
| 437 | |
| 438 | for (i = 0; i < sizeof(full_tests)/sizeof(*full_tests); i++) { |
| 439 | const char *f, *t; |
| 440 | int eret, aret; |
| 441 | f = full_tests[i].wildcard; |
| 442 | t = full_tests[i].target; |
| 443 | eret = full_tests[i].expected_result; |
| 444 | aret = wc_match(f, t); |
| 445 | if (aret != eret) { |
| 446 | printf("failed test: /%s/ against /%s/ returned %d not %d\n", |
| 447 | full_tests[i].wildcard, full_tests[i].target, |
| 448 | aret, eret); |
| 449 | fails++; |
| 450 | } else |
| 451 | passes++; |
| 452 | } |
| 453 | |
| 454 | printf("passed %d, failed %d\n", passes, fails); |
| 455 | |
| 456 | return 0; |
| 457 | } |
| 458 | |
| 459 | #endif |