4eb24e3a |
1 | /* |
2 | * Wildcard matching engine for use with SFTP-based file transfer |
3 | * programs (PSFTP, new-look PSCP): since SFTP has no notion of |
4 | * getting the remote side to do globbing (and rightly so) we have |
5 | * to do it locally, by retrieving all the filenames in a directory |
6 | * and checking each against the wildcard pattern. |
7 | */ |
8 | |
9 | #include <assert.h> |
10 | #include <stdlib.h> |
11 | #include <string.h> |
12 | |
aed97b82 |
13 | #include "putty.h" |
14 | |
4eb24e3a |
15 | /* |
16 | * Definition of wildcard syntax: |
17 | * |
18 | * - * matches any sequence of characters, including zero. |
19 | * - ? matches exactly one character which can be anything. |
20 | * - [abc] matches exactly one character which is a, b or c. |
21 | * - [a-f] matches anything from a through f. |
22 | * - [^a-f] matches anything _except_ a through f. |
23 | * - [-_] matches - or _; [^-_] matches anything else. (The - is |
24 | * non-special if it occurs immediately after the opening |
25 | * bracket or ^.) |
26 | * - [a^] matches an a or a ^. (The ^ is non-special if it does |
27 | * _not_ occur immediately after the opening bracket.) |
28 | * - \*, \?, \[, \], \\ match the single characters *, ?, [, ], \. |
29 | * - All other characters are non-special and match themselves. |
30 | */ |
31 | |
32 | /* |
c524da73 |
33 | * Some notes on differences from POSIX globs (IEEE Std 1003.1, 2003 ed.): |
34 | * - backslashes act as escapes even within [] bracket expressions |
35 | * - does not support [!...] for non-matching list (POSIX are weird); |
36 | * NB POSIX allows [^...] as well via "A bracket expression starting |
37 | * with an unquoted circumflex character produces unspecified |
38 | * results". If we wanted to allow [!...] we might want to define |
39 | * [^!] as having its literal meaning (match '^' or '!'). |
40 | * - none of the scary [[:class:]] stuff, etc |
41 | */ |
42 | |
43 | /* |
4eb24e3a |
44 | * The wildcard matching technique we use is very simple and |
45 | * potentially O(N^2) in running time, but I don't anticipate it |
46 | * being that bad in reality (particularly since N will be the size |
47 | * of a filename, which isn't all that much). Perhaps one day, once |
48 | * PuTTY has grown a regexp matcher for some other reason, I might |
49 | * come back and reimplement wildcards by translating them into |
50 | * regexps or directly into NFAs; but for the moment, in the |
51 | * absence of any other need for the NFA->DFA translation engine, |
52 | * anything more than the simplest possible wildcard matcher is |
53 | * vast code-size overkill. |
54 | * |
55 | * Essentially, these wildcards are much simpler than regexps in |
56 | * that they consist of a sequence of rigid fragments (? and [...] |
57 | * can never match more or less than one character) separated by |
58 | * asterisks. It is therefore extremely simple to look at a rigid |
59 | * fragment and determine whether or not it begins at a particular |
60 | * point in the test string; so we can search along the string |
61 | * until we find each fragment, then search for the next. As long |
62 | * as we find each fragment in the _first_ place it occurs, there |
63 | * will never be a danger of having to backpedal and try to find it |
64 | * again somewhere else. |
65 | */ |
66 | |
67 | enum { |
68 | WC_TRAILINGBACKSLASH = 1, |
69 | WC_UNCLOSEDCLASS, |
62cdf1ed |
70 | WC_INVALIDRANGE |
4eb24e3a |
71 | }; |
72 | |
73 | /* |
74 | * Error reporting is done by returning various negative values |
75 | * from the wildcard routines. Passing any such value to wc_error |
76 | * will give a human-readable message. |
77 | */ |
78 | const char *wc_error(int value) |
79 | { |
80 | value = abs(value); |
81 | switch (value) { |
82 | case WC_TRAILINGBACKSLASH: |
83 | return "'\' occurred at end of string (expected another character)"; |
84 | case WC_UNCLOSEDCLASS: |
85 | return "expected ']' to close character class"; |
86 | case WC_INVALIDRANGE: |
87 | return "character range was not terminated (']' just after '-')"; |
88 | } |
89 | return "INTERNAL ERROR: unrecognised wildcard error number"; |
90 | } |
91 | |
92 | /* |
93 | * This is the routine that tests a target string to see if an |
94 | * initial substring of it matches a fragment. If successful, it |
95 | * returns 1, and advances both `fragment' and `target' past the |
96 | * fragment and matching substring respectively. If unsuccessful it |
97 | * returns zero. If the wildcard fragment suffers a syntax error, |
98 | * it returns <0 and the precise value indexes into wc_error. |
99 | */ |
100 | static int wc_match_fragment(const char **fragment, const char **target) |
101 | { |
102 | const char *f, *t; |
103 | |
104 | f = *fragment; |
105 | t = *target; |
106 | /* |
107 | * The fragment terminates at either the end of the string, or |
108 | * the first (unescaped) *. |
109 | */ |
110 | while (*f && *f != '*' && *t) { |
111 | /* |
112 | * Extract one character from t, and one character's worth |
113 | * of pattern from f, and step along both. Return 0 if they |
114 | * fail to match. |
115 | */ |
116 | if (*f == '\\') { |
117 | /* |
118 | * Backslash, which means f[1] is to be treated as a |
119 | * literal character no matter what it is. It may not |
120 | * be the end of the string. |
121 | */ |
122 | if (!f[1]) |
123 | return -WC_TRAILINGBACKSLASH; /* error */ |
124 | if (f[1] != *t) |
125 | return 0; /* failed to match */ |
126 | f += 2; |
127 | } else if (*f == '?') { |
128 | /* |
129 | * Question mark matches anything. |
130 | */ |
131 | f++; |
132 | } else if (*f == '[') { |
133 | int invert = 0; |
134 | int matched = 0; |
135 | /* |
136 | * Open bracket introduces a character class. |
137 | */ |
138 | f++; |
139 | if (*f == '^') { |
140 | invert = 1; |
141 | f++; |
142 | } |
143 | while (*f != ']') { |
144 | if (*f == '\\') |
145 | f++; /* backslashes still work */ |
146 | if (!*f) |
147 | return -WC_UNCLOSEDCLASS; /* error again */ |
148 | if (f[1] == '-') { |
149 | int lower, upper, ourchr; |
150 | lower = (unsigned char) *f++; |
151 | f++; /* eat the minus */ |
152 | if (*f == ']') |
153 | return -WC_INVALIDRANGE; /* different error! */ |
154 | if (*f == '\\') |
155 | f++; /* backslashes _still_ work */ |
156 | if (!*f) |
157 | return -WC_UNCLOSEDCLASS; /* error again */ |
158 | upper = (unsigned char) *f++; |
159 | ourchr = (unsigned char) *t; |
160 | if (lower > upper) { |
161 | int t = lower; lower = upper; upper = t; |
162 | } |
163 | if (ourchr >= lower && ourchr <= upper) |
164 | matched = 1; |
165 | } else { |
166 | matched |= (*t == *f++); |
167 | } |
168 | } |
169 | if (invert == matched) |
170 | return 0; /* failed to match character class */ |
171 | f++; /* eat the ] */ |
172 | } else { |
173 | /* |
174 | * Non-special character matches itself. |
175 | */ |
176 | if (*f != *t) |
177 | return 0; |
178 | f++; |
179 | } |
180 | /* |
181 | * Now we've done that, increment t past the character we |
182 | * matched. |
183 | */ |
184 | t++; |
185 | } |
186 | if (!*f || *f == '*') { |
187 | /* |
188 | * We have reached the end of f without finding a mismatch; |
189 | * so we're done. Update the caller pointers and return 1. |
190 | */ |
191 | *fragment = f; |
192 | *target = t; |
193 | return 1; |
194 | } |
195 | /* |
196 | * Otherwise, we must have reached the end of t before we |
197 | * reached the end of f; so we've failed. Return 0. |
198 | */ |
199 | return 0; |
200 | } |
201 | |
202 | /* |
203 | * This is the real wildcard matching routine. It returns 1 for a |
204 | * successful match, 0 for an unsuccessful match, and <0 for a |
205 | * syntax error in the wildcard. |
206 | */ |
207 | int wc_match(const char *wildcard, const char *target) |
208 | { |
209 | int ret; |
210 | |
211 | /* |
212 | * Every time we see a '*' _followed_ by a fragment, we just |
213 | * search along the string for a location at which the fragment |
214 | * matches. The only special case is when we see a fragment |
215 | * right at the start, in which case we just call the matching |
216 | * routine once and give up if it fails. |
217 | */ |
218 | if (*wildcard != '*') { |
219 | ret = wc_match_fragment(&wildcard, &target); |
220 | if (ret <= 0) |
221 | return ret; /* pass back failure or error alike */ |
222 | } |
223 | |
224 | while (*wildcard) { |
225 | assert(*wildcard == '*'); |
226 | while (*wildcard == '*') |
227 | wildcard++; |
228 | |
229 | /* |
230 | * It's possible we've just hit the end of the wildcard |
231 | * after seeing a *, in which case there's no need to |
232 | * bother searching any more because we've won. |
233 | */ |
234 | if (!*wildcard) |
235 | return 1; |
236 | |
237 | /* |
238 | * Now `wildcard' points at the next fragment. So we |
239 | * attempt to match it against `target', and if that fails |
240 | * we increment `target' and try again, and so on. When we |
241 | * find we're about to try matching against the empty |
242 | * string, we give up and return 0. |
243 | */ |
244 | ret = 0; |
245 | while (*target) { |
246 | const char *save_w = wildcard, *save_t = target; |
247 | |
248 | ret = wc_match_fragment(&wildcard, &target); |
249 | |
250 | if (ret < 0) |
251 | return ret; /* syntax error */ |
252 | |
253 | if (ret > 0 && !*wildcard && *target) { |
254 | /* |
255 | * Final special case - literally. |
256 | * |
257 | * This situation arises when we are matching a |
258 | * _terminal_ fragment of the wildcard (that is, |
259 | * there is nothing after it, e.g. "*a"), and it |
260 | * has matched _too early_. For example, matching |
261 | * "*a" against "parka" will match the "a" fragment |
262 | * against the _first_ a, and then (if it weren't |
263 | * for this special case) matching would fail |
264 | * because we're at the end of the wildcard but not |
265 | * at the end of the target string. |
266 | * |
267 | * In this case what we must do is measure the |
268 | * length of the fragment in the target (which is |
269 | * why we saved `target'), jump straight to that |
270 | * distance from the end of the string using |
271 | * strlen, and match the same fragment again there |
272 | * (which is why we saved `wildcard'). Then we |
273 | * return whatever that operation returns. |
274 | */ |
275 | target = save_t + strlen(save_t) - (target - save_t); |
276 | wildcard = save_w; |
277 | return wc_match_fragment(&wildcard, &target); |
278 | } |
279 | |
280 | if (ret > 0) |
281 | break; |
282 | target++; |
283 | } |
284 | if (ret > 0) |
285 | continue; |
286 | return 0; |
287 | } |
288 | |
289 | /* |
290 | * If we reach here, it must be because we successfully matched |
291 | * a fragment and then found ourselves right at the end of the |
292 | * wildcard. Hence, we return 1 if and only if we are also |
293 | * right at the end of the target. |
294 | */ |
295 | return (*target ? 0 : 1); |
296 | } |
297 | |
298 | /* |
299 | * Another utility routine that translates a non-wildcard string |
300 | * into its raw equivalent by removing any escaping backslashes. |
301 | * Expects a target string buffer of anything up to the length of |
302 | * the original wildcard. You can also pass NULL as the output |
303 | * buffer if you're only interested in the return value. |
304 | * |
305 | * Returns 1 on success, or 0 if a wildcard character was |
306 | * encountered. In the latter case the output string MAY not be |
307 | * zero-terminated and you should not use it for anything! |
308 | */ |
309 | int wc_unescape(char *output, const char *wildcard) |
310 | { |
311 | while (*wildcard) { |
312 | if (*wildcard == '\\') { |
313 | wildcard++; |
314 | /* We are lenient about trailing backslashes in non-wildcards. */ |
315 | if (*wildcard) { |
316 | if (output) |
317 | *output++ = *wildcard; |
318 | wildcard++; |
319 | } |
320 | } else if (*wildcard == '*' || *wildcard == '?' || |
321 | *wildcard == '[' || *wildcard == ']') { |
322 | return 0; /* it's a wildcard! */ |
323 | } else { |
324 | if (output) |
325 | *output++ = *wildcard; |
326 | wildcard++; |
327 | } |
328 | } |
329 | *output = '\0'; |
330 | return 1; /* it's clean */ |
331 | } |
332 | |
333 | #ifdef TESTMODE |
334 | |
335 | struct test { |
336 | const char *wildcard; |
337 | const char *target; |
338 | int expected_result; |
339 | }; |
340 | |
341 | const struct test fragment_tests[] = { |
342 | /* |
343 | * We exhaustively unit-test the fragment matching routine |
344 | * itself, which should save us the need to test all its |
345 | * intricacies during the full wildcard tests. |
346 | */ |
347 | {"abc", "abc", 1}, |
348 | {"abc", "abd", 0}, |
349 | {"abc", "abcd", 1}, |
350 | {"abcd", "abc", 0}, |
351 | {"ab[cd]", "abc", 1}, |
352 | {"ab[cd]", "abd", 1}, |
353 | {"ab[cd]", "abe", 0}, |
354 | {"ab[^cd]", "abc", 0}, |
355 | {"ab[^cd]", "abd", 0}, |
356 | {"ab[^cd]", "abe", 1}, |
357 | {"ab\\", "abc", -WC_TRAILINGBACKSLASH}, |
358 | {"ab\\*", "ab*", 1}, |
359 | {"ab\\?", "ab*", 0}, |
360 | {"ab?", "abc", 1}, |
361 | {"ab?", "ab", 0}, |
362 | {"ab[", "abc", -WC_UNCLOSEDCLASS}, |
363 | {"ab[c-", "abb", -WC_UNCLOSEDCLASS}, |
364 | {"ab[c-]", "abb", -WC_INVALIDRANGE}, |
365 | {"ab[c-e]", "abb", 0}, |
366 | {"ab[c-e]", "abc", 1}, |
367 | {"ab[c-e]", "abd", 1}, |
368 | {"ab[c-e]", "abe", 1}, |
369 | {"ab[c-e]", "abf", 0}, |
370 | {"ab[e-c]", "abb", 0}, |
371 | {"ab[e-c]", "abc", 1}, |
372 | {"ab[e-c]", "abd", 1}, |
373 | {"ab[e-c]", "abe", 1}, |
374 | {"ab[e-c]", "abf", 0}, |
375 | {"ab[^c-e]", "abb", 1}, |
376 | {"ab[^c-e]", "abc", 0}, |
377 | {"ab[^c-e]", "abd", 0}, |
378 | {"ab[^c-e]", "abe", 0}, |
379 | {"ab[^c-e]", "abf", 1}, |
380 | {"ab[^e-c]", "abb", 1}, |
381 | {"ab[^e-c]", "abc", 0}, |
382 | {"ab[^e-c]", "abd", 0}, |
383 | {"ab[^e-c]", "abe", 0}, |
384 | {"ab[^e-c]", "abf", 1}, |
385 | {"ab[a^]", "aba", 1}, |
386 | {"ab[a^]", "ab^", 1}, |
387 | {"ab[a^]", "abb", 0}, |
388 | {"ab[^a^]", "aba", 0}, |
389 | {"ab[^a^]", "ab^", 0}, |
390 | {"ab[^a^]", "abb", 1}, |
391 | {"ab[-c]", "ab-", 1}, |
392 | {"ab[-c]", "abc", 1}, |
393 | {"ab[-c]", "abd", 0}, |
394 | {"ab[^-c]", "ab-", 0}, |
395 | {"ab[^-c]", "abc", 0}, |
396 | {"ab[^-c]", "abd", 1}, |
397 | {"ab[\\[-\\]]", "abZ", 0}, |
398 | {"ab[\\[-\\]]", "ab[", 1}, |
399 | {"ab[\\[-\\]]", "ab\\", 1}, |
400 | {"ab[\\[-\\]]", "ab]", 1}, |
401 | {"ab[\\[-\\]]", "ab^", 0}, |
402 | {"ab[^\\[-\\]]", "abZ", 1}, |
403 | {"ab[^\\[-\\]]", "ab[", 0}, |
404 | {"ab[^\\[-\\]]", "ab\\", 0}, |
405 | {"ab[^\\[-\\]]", "ab]", 0}, |
406 | {"ab[^\\[-\\]]", "ab^", 1}, |
407 | {"ab[a-fA-F]", "aba", 1}, |
408 | {"ab[a-fA-F]", "abF", 1}, |
409 | {"ab[a-fA-F]", "abZ", 0}, |
410 | }; |
411 | |
412 | const struct test full_tests[] = { |
413 | {"a", "argh", 0}, |
414 | {"a", "ba", 0}, |
415 | {"a", "a", 1}, |
416 | {"a*", "aardvark", 1}, |
417 | {"a*", "badger", 0}, |
418 | {"*a", "park", 0}, |
419 | {"*a", "pArka", 1}, |
420 | {"*a", "parka", 1}, |
421 | {"*a*", "park", 1}, |
422 | {"*a*", "perk", 0}, |
423 | {"?b*r?", "abracadabra", 1}, |
424 | {"?b*r?", "abracadabr", 0}, |
425 | {"?b*r?", "abracadabzr", 0}, |
426 | }; |
427 | |
428 | int main(void) |
429 | { |
430 | int i; |
431 | int fails, passes; |
432 | |
433 | fails = passes = 0; |
434 | |
435 | for (i = 0; i < sizeof(fragment_tests)/sizeof(*fragment_tests); i++) { |
436 | const char *f, *t; |
437 | int eret, aret; |
438 | f = fragment_tests[i].wildcard; |
439 | t = fragment_tests[i].target; |
440 | eret = fragment_tests[i].expected_result; |
441 | aret = wc_match_fragment(&f, &t); |
442 | if (aret != eret) { |
443 | printf("failed test: /%s/ against /%s/ returned %d not %d\n", |
444 | fragment_tests[i].wildcard, fragment_tests[i].target, |
445 | aret, eret); |
446 | fails++; |
447 | } else |
448 | passes++; |
449 | } |
450 | |
451 | for (i = 0; i < sizeof(full_tests)/sizeof(*full_tests); i++) { |
452 | const char *f, *t; |
453 | int eret, aret; |
454 | f = full_tests[i].wildcard; |
455 | t = full_tests[i].target; |
456 | eret = full_tests[i].expected_result; |
457 | aret = wc_match(f, t); |
458 | if (aret != eret) { |
459 | printf("failed test: /%s/ against /%s/ returned %d not %d\n", |
460 | full_tests[i].wildcard, full_tests[i].target, |
461 | aret, eret); |
462 | fails++; |
463 | } else |
464 | passes++; |
465 | } |
466 | |
467 | printf("passed %d, failed %d\n", passes, fails); |
468 | |
469 | return 0; |
470 | } |
471 | |
472 | #endif |