4eb24e3a |
1 | /* |
2 | * Wildcard matching engine for use with SFTP-based file transfer |
3 | * programs (PSFTP, new-look PSCP): since SFTP has no notion of |
4 | * getting the remote side to do globbing (and rightly so) we have |
5 | * to do it locally, by retrieving all the filenames in a directory |
6 | * and checking each against the wildcard pattern. |
7 | */ |
8 | |
9 | #include <assert.h> |
10 | #include <stdlib.h> |
11 | #include <string.h> |
12 | |
13 | /* |
14 | * Definition of wildcard syntax: |
15 | * |
16 | * - * matches any sequence of characters, including zero. |
17 | * - ? matches exactly one character which can be anything. |
18 | * - [abc] matches exactly one character which is a, b or c. |
19 | * - [a-f] matches anything from a through f. |
20 | * - [^a-f] matches anything _except_ a through f. |
21 | * - [-_] matches - or _; [^-_] matches anything else. (The - is |
22 | * non-special if it occurs immediately after the opening |
23 | * bracket or ^.) |
24 | * - [a^] matches an a or a ^. (The ^ is non-special if it does |
25 | * _not_ occur immediately after the opening bracket.) |
26 | * - \*, \?, \[, \], \\ match the single characters *, ?, [, ], \. |
27 | * - All other characters are non-special and match themselves. |
28 | */ |
29 | |
30 | /* |
31 | * The wildcard matching technique we use is very simple and |
32 | * potentially O(N^2) in running time, but I don't anticipate it |
33 | * being that bad in reality (particularly since N will be the size |
34 | * of a filename, which isn't all that much). Perhaps one day, once |
35 | * PuTTY has grown a regexp matcher for some other reason, I might |
36 | * come back and reimplement wildcards by translating them into |
37 | * regexps or directly into NFAs; but for the moment, in the |
38 | * absence of any other need for the NFA->DFA translation engine, |
39 | * anything more than the simplest possible wildcard matcher is |
40 | * vast code-size overkill. |
41 | * |
42 | * Essentially, these wildcards are much simpler than regexps in |
43 | * that they consist of a sequence of rigid fragments (? and [...] |
44 | * can never match more or less than one character) separated by |
45 | * asterisks. It is therefore extremely simple to look at a rigid |
46 | * fragment and determine whether or not it begins at a particular |
47 | * point in the test string; so we can search along the string |
48 | * until we find each fragment, then search for the next. As long |
49 | * as we find each fragment in the _first_ place it occurs, there |
50 | * will never be a danger of having to backpedal and try to find it |
51 | * again somewhere else. |
52 | */ |
53 | |
54 | enum { |
55 | WC_TRAILINGBACKSLASH = 1, |
56 | WC_UNCLOSEDCLASS, |
57 | WC_INVALIDRANGE, |
58 | }; |
59 | |
60 | /* |
61 | * Error reporting is done by returning various negative values |
62 | * from the wildcard routines. Passing any such value to wc_error |
63 | * will give a human-readable message. |
64 | */ |
65 | const char *wc_error(int value) |
66 | { |
67 | value = abs(value); |
68 | switch (value) { |
69 | case WC_TRAILINGBACKSLASH: |
70 | return "'\' occurred at end of string (expected another character)"; |
71 | case WC_UNCLOSEDCLASS: |
72 | return "expected ']' to close character class"; |
73 | case WC_INVALIDRANGE: |
74 | return "character range was not terminated (']' just after '-')"; |
75 | } |
76 | return "INTERNAL ERROR: unrecognised wildcard error number"; |
77 | } |
78 | |
79 | /* |
80 | * This is the routine that tests a target string to see if an |
81 | * initial substring of it matches a fragment. If successful, it |
82 | * returns 1, and advances both `fragment' and `target' past the |
83 | * fragment and matching substring respectively. If unsuccessful it |
84 | * returns zero. If the wildcard fragment suffers a syntax error, |
85 | * it returns <0 and the precise value indexes into wc_error. |
86 | */ |
87 | static int wc_match_fragment(const char **fragment, const char **target) |
88 | { |
89 | const char *f, *t; |
90 | |
91 | f = *fragment; |
92 | t = *target; |
93 | /* |
94 | * The fragment terminates at either the end of the string, or |
95 | * the first (unescaped) *. |
96 | */ |
97 | while (*f && *f != '*' && *t) { |
98 | /* |
99 | * Extract one character from t, and one character's worth |
100 | * of pattern from f, and step along both. Return 0 if they |
101 | * fail to match. |
102 | */ |
103 | if (*f == '\\') { |
104 | /* |
105 | * Backslash, which means f[1] is to be treated as a |
106 | * literal character no matter what it is. It may not |
107 | * be the end of the string. |
108 | */ |
109 | if (!f[1]) |
110 | return -WC_TRAILINGBACKSLASH; /* error */ |
111 | if (f[1] != *t) |
112 | return 0; /* failed to match */ |
113 | f += 2; |
114 | } else if (*f == '?') { |
115 | /* |
116 | * Question mark matches anything. |
117 | */ |
118 | f++; |
119 | } else if (*f == '[') { |
120 | int invert = 0; |
121 | int matched = 0; |
122 | /* |
123 | * Open bracket introduces a character class. |
124 | */ |
125 | f++; |
126 | if (*f == '^') { |
127 | invert = 1; |
128 | f++; |
129 | } |
130 | while (*f != ']') { |
131 | if (*f == '\\') |
132 | f++; /* backslashes still work */ |
133 | if (!*f) |
134 | return -WC_UNCLOSEDCLASS; /* error again */ |
135 | if (f[1] == '-') { |
136 | int lower, upper, ourchr; |
137 | lower = (unsigned char) *f++; |
138 | f++; /* eat the minus */ |
139 | if (*f == ']') |
140 | return -WC_INVALIDRANGE; /* different error! */ |
141 | if (*f == '\\') |
142 | f++; /* backslashes _still_ work */ |
143 | if (!*f) |
144 | return -WC_UNCLOSEDCLASS; /* error again */ |
145 | upper = (unsigned char) *f++; |
146 | ourchr = (unsigned char) *t; |
147 | if (lower > upper) { |
148 | int t = lower; lower = upper; upper = t; |
149 | } |
150 | if (ourchr >= lower && ourchr <= upper) |
151 | matched = 1; |
152 | } else { |
153 | matched |= (*t == *f++); |
154 | } |
155 | } |
156 | if (invert == matched) |
157 | return 0; /* failed to match character class */ |
158 | f++; /* eat the ] */ |
159 | } else { |
160 | /* |
161 | * Non-special character matches itself. |
162 | */ |
163 | if (*f != *t) |
164 | return 0; |
165 | f++; |
166 | } |
167 | /* |
168 | * Now we've done that, increment t past the character we |
169 | * matched. |
170 | */ |
171 | t++; |
172 | } |
173 | if (!*f || *f == '*') { |
174 | /* |
175 | * We have reached the end of f without finding a mismatch; |
176 | * so we're done. Update the caller pointers and return 1. |
177 | */ |
178 | *fragment = f; |
179 | *target = t; |
180 | return 1; |
181 | } |
182 | /* |
183 | * Otherwise, we must have reached the end of t before we |
184 | * reached the end of f; so we've failed. Return 0. |
185 | */ |
186 | return 0; |
187 | } |
188 | |
189 | /* |
190 | * This is the real wildcard matching routine. It returns 1 for a |
191 | * successful match, 0 for an unsuccessful match, and <0 for a |
192 | * syntax error in the wildcard. |
193 | */ |
194 | int wc_match(const char *wildcard, const char *target) |
195 | { |
196 | int ret; |
197 | |
198 | /* |
199 | * Every time we see a '*' _followed_ by a fragment, we just |
200 | * search along the string for a location at which the fragment |
201 | * matches. The only special case is when we see a fragment |
202 | * right at the start, in which case we just call the matching |
203 | * routine once and give up if it fails. |
204 | */ |
205 | if (*wildcard != '*') { |
206 | ret = wc_match_fragment(&wildcard, &target); |
207 | if (ret <= 0) |
208 | return ret; /* pass back failure or error alike */ |
209 | } |
210 | |
211 | while (*wildcard) { |
212 | assert(*wildcard == '*'); |
213 | while (*wildcard == '*') |
214 | wildcard++; |
215 | |
216 | /* |
217 | * It's possible we've just hit the end of the wildcard |
218 | * after seeing a *, in which case there's no need to |
219 | * bother searching any more because we've won. |
220 | */ |
221 | if (!*wildcard) |
222 | return 1; |
223 | |
224 | /* |
225 | * Now `wildcard' points at the next fragment. So we |
226 | * attempt to match it against `target', and if that fails |
227 | * we increment `target' and try again, and so on. When we |
228 | * find we're about to try matching against the empty |
229 | * string, we give up and return 0. |
230 | */ |
231 | ret = 0; |
232 | while (*target) { |
233 | const char *save_w = wildcard, *save_t = target; |
234 | |
235 | ret = wc_match_fragment(&wildcard, &target); |
236 | |
237 | if (ret < 0) |
238 | return ret; /* syntax error */ |
239 | |
240 | if (ret > 0 && !*wildcard && *target) { |
241 | /* |
242 | * Final special case - literally. |
243 | * |
244 | * This situation arises when we are matching a |
245 | * _terminal_ fragment of the wildcard (that is, |
246 | * there is nothing after it, e.g. "*a"), and it |
247 | * has matched _too early_. For example, matching |
248 | * "*a" against "parka" will match the "a" fragment |
249 | * against the _first_ a, and then (if it weren't |
250 | * for this special case) matching would fail |
251 | * because we're at the end of the wildcard but not |
252 | * at the end of the target string. |
253 | * |
254 | * In this case what we must do is measure the |
255 | * length of the fragment in the target (which is |
256 | * why we saved `target'), jump straight to that |
257 | * distance from the end of the string using |
258 | * strlen, and match the same fragment again there |
259 | * (which is why we saved `wildcard'). Then we |
260 | * return whatever that operation returns. |
261 | */ |
262 | target = save_t + strlen(save_t) - (target - save_t); |
263 | wildcard = save_w; |
264 | return wc_match_fragment(&wildcard, &target); |
265 | } |
266 | |
267 | if (ret > 0) |
268 | break; |
269 | target++; |
270 | } |
271 | if (ret > 0) |
272 | continue; |
273 | return 0; |
274 | } |
275 | |
276 | /* |
277 | * If we reach here, it must be because we successfully matched |
278 | * a fragment and then found ourselves right at the end of the |
279 | * wildcard. Hence, we return 1 if and only if we are also |
280 | * right at the end of the target. |
281 | */ |
282 | return (*target ? 0 : 1); |
283 | } |
284 | |
285 | /* |
286 | * Another utility routine that translates a non-wildcard string |
287 | * into its raw equivalent by removing any escaping backslashes. |
288 | * Expects a target string buffer of anything up to the length of |
289 | * the original wildcard. You can also pass NULL as the output |
290 | * buffer if you're only interested in the return value. |
291 | * |
292 | * Returns 1 on success, or 0 if a wildcard character was |
293 | * encountered. In the latter case the output string MAY not be |
294 | * zero-terminated and you should not use it for anything! |
295 | */ |
296 | int wc_unescape(char *output, const char *wildcard) |
297 | { |
298 | while (*wildcard) { |
299 | if (*wildcard == '\\') { |
300 | wildcard++; |
301 | /* We are lenient about trailing backslashes in non-wildcards. */ |
302 | if (*wildcard) { |
303 | if (output) |
304 | *output++ = *wildcard; |
305 | wildcard++; |
306 | } |
307 | } else if (*wildcard == '*' || *wildcard == '?' || |
308 | *wildcard == '[' || *wildcard == ']') { |
309 | return 0; /* it's a wildcard! */ |
310 | } else { |
311 | if (output) |
312 | *output++ = *wildcard; |
313 | wildcard++; |
314 | } |
315 | } |
316 | *output = '\0'; |
317 | return 1; /* it's clean */ |
318 | } |
319 | |
320 | #ifdef TESTMODE |
321 | |
322 | struct test { |
323 | const char *wildcard; |
324 | const char *target; |
325 | int expected_result; |
326 | }; |
327 | |
328 | const struct test fragment_tests[] = { |
329 | /* |
330 | * We exhaustively unit-test the fragment matching routine |
331 | * itself, which should save us the need to test all its |
332 | * intricacies during the full wildcard tests. |
333 | */ |
334 | {"abc", "abc", 1}, |
335 | {"abc", "abd", 0}, |
336 | {"abc", "abcd", 1}, |
337 | {"abcd", "abc", 0}, |
338 | {"ab[cd]", "abc", 1}, |
339 | {"ab[cd]", "abd", 1}, |
340 | {"ab[cd]", "abe", 0}, |
341 | {"ab[^cd]", "abc", 0}, |
342 | {"ab[^cd]", "abd", 0}, |
343 | {"ab[^cd]", "abe", 1}, |
344 | {"ab\\", "abc", -WC_TRAILINGBACKSLASH}, |
345 | {"ab\\*", "ab*", 1}, |
346 | {"ab\\?", "ab*", 0}, |
347 | {"ab?", "abc", 1}, |
348 | {"ab?", "ab", 0}, |
349 | {"ab[", "abc", -WC_UNCLOSEDCLASS}, |
350 | {"ab[c-", "abb", -WC_UNCLOSEDCLASS}, |
351 | {"ab[c-]", "abb", -WC_INVALIDRANGE}, |
352 | {"ab[c-e]", "abb", 0}, |
353 | {"ab[c-e]", "abc", 1}, |
354 | {"ab[c-e]", "abd", 1}, |
355 | {"ab[c-e]", "abe", 1}, |
356 | {"ab[c-e]", "abf", 0}, |
357 | {"ab[e-c]", "abb", 0}, |
358 | {"ab[e-c]", "abc", 1}, |
359 | {"ab[e-c]", "abd", 1}, |
360 | {"ab[e-c]", "abe", 1}, |
361 | {"ab[e-c]", "abf", 0}, |
362 | {"ab[^c-e]", "abb", 1}, |
363 | {"ab[^c-e]", "abc", 0}, |
364 | {"ab[^c-e]", "abd", 0}, |
365 | {"ab[^c-e]", "abe", 0}, |
366 | {"ab[^c-e]", "abf", 1}, |
367 | {"ab[^e-c]", "abb", 1}, |
368 | {"ab[^e-c]", "abc", 0}, |
369 | {"ab[^e-c]", "abd", 0}, |
370 | {"ab[^e-c]", "abe", 0}, |
371 | {"ab[^e-c]", "abf", 1}, |
372 | {"ab[a^]", "aba", 1}, |
373 | {"ab[a^]", "ab^", 1}, |
374 | {"ab[a^]", "abb", 0}, |
375 | {"ab[^a^]", "aba", 0}, |
376 | {"ab[^a^]", "ab^", 0}, |
377 | {"ab[^a^]", "abb", 1}, |
378 | {"ab[-c]", "ab-", 1}, |
379 | {"ab[-c]", "abc", 1}, |
380 | {"ab[-c]", "abd", 0}, |
381 | {"ab[^-c]", "ab-", 0}, |
382 | {"ab[^-c]", "abc", 0}, |
383 | {"ab[^-c]", "abd", 1}, |
384 | {"ab[\\[-\\]]", "abZ", 0}, |
385 | {"ab[\\[-\\]]", "ab[", 1}, |
386 | {"ab[\\[-\\]]", "ab\\", 1}, |
387 | {"ab[\\[-\\]]", "ab]", 1}, |
388 | {"ab[\\[-\\]]", "ab^", 0}, |
389 | {"ab[^\\[-\\]]", "abZ", 1}, |
390 | {"ab[^\\[-\\]]", "ab[", 0}, |
391 | {"ab[^\\[-\\]]", "ab\\", 0}, |
392 | {"ab[^\\[-\\]]", "ab]", 0}, |
393 | {"ab[^\\[-\\]]", "ab^", 1}, |
394 | {"ab[a-fA-F]", "aba", 1}, |
395 | {"ab[a-fA-F]", "abF", 1}, |
396 | {"ab[a-fA-F]", "abZ", 0}, |
397 | }; |
398 | |
399 | const struct test full_tests[] = { |
400 | {"a", "argh", 0}, |
401 | {"a", "ba", 0}, |
402 | {"a", "a", 1}, |
403 | {"a*", "aardvark", 1}, |
404 | {"a*", "badger", 0}, |
405 | {"*a", "park", 0}, |
406 | {"*a", "pArka", 1}, |
407 | {"*a", "parka", 1}, |
408 | {"*a*", "park", 1}, |
409 | {"*a*", "perk", 0}, |
410 | {"?b*r?", "abracadabra", 1}, |
411 | {"?b*r?", "abracadabr", 0}, |
412 | {"?b*r?", "abracadabzr", 0}, |
413 | }; |
414 | |
415 | int main(void) |
416 | { |
417 | int i; |
418 | int fails, passes; |
419 | |
420 | fails = passes = 0; |
421 | |
422 | for (i = 0; i < sizeof(fragment_tests)/sizeof(*fragment_tests); i++) { |
423 | const char *f, *t; |
424 | int eret, aret; |
425 | f = fragment_tests[i].wildcard; |
426 | t = fragment_tests[i].target; |
427 | eret = fragment_tests[i].expected_result; |
428 | aret = wc_match_fragment(&f, &t); |
429 | if (aret != eret) { |
430 | printf("failed test: /%s/ against /%s/ returned %d not %d\n", |
431 | fragment_tests[i].wildcard, fragment_tests[i].target, |
432 | aret, eret); |
433 | fails++; |
434 | } else |
435 | passes++; |
436 | } |
437 | |
438 | for (i = 0; i < sizeof(full_tests)/sizeof(*full_tests); i++) { |
439 | const char *f, *t; |
440 | int eret, aret; |
441 | f = full_tests[i].wildcard; |
442 | t = full_tests[i].target; |
443 | eret = full_tests[i].expected_result; |
444 | aret = wc_match(f, t); |
445 | if (aret != eret) { |
446 | printf("failed test: /%s/ against /%s/ returned %d not %d\n", |
447 | full_tests[i].wildcard, full_tests[i].target, |
448 | aret, eret); |
449 | fails++; |
450 | } else |
451 | passes++; |
452 | } |
453 | |
454 | printf("passed %d, failed %d\n", passes, fails); |
455 | |
456 | return 0; |
457 | } |
458 | |
459 | #endif |