4eb24e3a |
1 | /* |
2 | * Wildcard matching engine for use with SFTP-based file transfer |
3 | * programs (PSFTP, new-look PSCP): since SFTP has no notion of |
4 | * getting the remote side to do globbing (and rightly so) we have |
5 | * to do it locally, by retrieving all the filenames in a directory |
6 | * and checking each against the wildcard pattern. |
7 | */ |
8 | |
9 | #include <assert.h> |
10 | #include <stdlib.h> |
11 | #include <string.h> |
12 | |
aed97b82 |
13 | #include "putty.h" |
14 | |
4eb24e3a |
15 | /* |
16 | * Definition of wildcard syntax: |
17 | * |
18 | * - * matches any sequence of characters, including zero. |
19 | * - ? matches exactly one character which can be anything. |
20 | * - [abc] matches exactly one character which is a, b or c. |
21 | * - [a-f] matches anything from a through f. |
22 | * - [^a-f] matches anything _except_ a through f. |
23 | * - [-_] matches - or _; [^-_] matches anything else. (The - is |
24 | * non-special if it occurs immediately after the opening |
25 | * bracket or ^.) |
26 | * - [a^] matches an a or a ^. (The ^ is non-special if it does |
27 | * _not_ occur immediately after the opening bracket.) |
28 | * - \*, \?, \[, \], \\ match the single characters *, ?, [, ], \. |
29 | * - All other characters are non-special and match themselves. |
30 | */ |
31 | |
32 | /* |
33 | * The wildcard matching technique we use is very simple and |
34 | * potentially O(N^2) in running time, but I don't anticipate it |
35 | * being that bad in reality (particularly since N will be the size |
36 | * of a filename, which isn't all that much). Perhaps one day, once |
37 | * PuTTY has grown a regexp matcher for some other reason, I might |
38 | * come back and reimplement wildcards by translating them into |
39 | * regexps or directly into NFAs; but for the moment, in the |
40 | * absence of any other need for the NFA->DFA translation engine, |
41 | * anything more than the simplest possible wildcard matcher is |
42 | * vast code-size overkill. |
43 | * |
44 | * Essentially, these wildcards are much simpler than regexps in |
45 | * that they consist of a sequence of rigid fragments (? and [...] |
46 | * can never match more or less than one character) separated by |
47 | * asterisks. It is therefore extremely simple to look at a rigid |
48 | * fragment and determine whether or not it begins at a particular |
49 | * point in the test string; so we can search along the string |
50 | * until we find each fragment, then search for the next. As long |
51 | * as we find each fragment in the _first_ place it occurs, there |
52 | * will never be a danger of having to backpedal and try to find it |
53 | * again somewhere else. |
54 | */ |
55 | |
56 | enum { |
57 | WC_TRAILINGBACKSLASH = 1, |
58 | WC_UNCLOSEDCLASS, |
62cdf1ed |
59 | WC_INVALIDRANGE |
4eb24e3a |
60 | }; |
61 | |
62 | /* |
63 | * Error reporting is done by returning various negative values |
64 | * from the wildcard routines. Passing any such value to wc_error |
65 | * will give a human-readable message. |
66 | */ |
67 | const char *wc_error(int value) |
68 | { |
69 | value = abs(value); |
70 | switch (value) { |
71 | case WC_TRAILINGBACKSLASH: |
72 | return "'\' occurred at end of string (expected another character)"; |
73 | case WC_UNCLOSEDCLASS: |
74 | return "expected ']' to close character class"; |
75 | case WC_INVALIDRANGE: |
76 | return "character range was not terminated (']' just after '-')"; |
77 | } |
78 | return "INTERNAL ERROR: unrecognised wildcard error number"; |
79 | } |
80 | |
81 | /* |
82 | * This is the routine that tests a target string to see if an |
83 | * initial substring of it matches a fragment. If successful, it |
84 | * returns 1, and advances both `fragment' and `target' past the |
85 | * fragment and matching substring respectively. If unsuccessful it |
86 | * returns zero. If the wildcard fragment suffers a syntax error, |
87 | * it returns <0 and the precise value indexes into wc_error. |
88 | */ |
89 | static int wc_match_fragment(const char **fragment, const char **target) |
90 | { |
91 | const char *f, *t; |
92 | |
93 | f = *fragment; |
94 | t = *target; |
95 | /* |
96 | * The fragment terminates at either the end of the string, or |
97 | * the first (unescaped) *. |
98 | */ |
99 | while (*f && *f != '*' && *t) { |
100 | /* |
101 | * Extract one character from t, and one character's worth |
102 | * of pattern from f, and step along both. Return 0 if they |
103 | * fail to match. |
104 | */ |
105 | if (*f == '\\') { |
106 | /* |
107 | * Backslash, which means f[1] is to be treated as a |
108 | * literal character no matter what it is. It may not |
109 | * be the end of the string. |
110 | */ |
111 | if (!f[1]) |
112 | return -WC_TRAILINGBACKSLASH; /* error */ |
113 | if (f[1] != *t) |
114 | return 0; /* failed to match */ |
115 | f += 2; |
116 | } else if (*f == '?') { |
117 | /* |
118 | * Question mark matches anything. |
119 | */ |
120 | f++; |
121 | } else if (*f == '[') { |
122 | int invert = 0; |
123 | int matched = 0; |
124 | /* |
125 | * Open bracket introduces a character class. |
126 | */ |
127 | f++; |
128 | if (*f == '^') { |
129 | invert = 1; |
130 | f++; |
131 | } |
132 | while (*f != ']') { |
133 | if (*f == '\\') |
134 | f++; /* backslashes still work */ |
135 | if (!*f) |
136 | return -WC_UNCLOSEDCLASS; /* error again */ |
137 | if (f[1] == '-') { |
138 | int lower, upper, ourchr; |
139 | lower = (unsigned char) *f++; |
140 | f++; /* eat the minus */ |
141 | if (*f == ']') |
142 | return -WC_INVALIDRANGE; /* different error! */ |
143 | if (*f == '\\') |
144 | f++; /* backslashes _still_ work */ |
145 | if (!*f) |
146 | return -WC_UNCLOSEDCLASS; /* error again */ |
147 | upper = (unsigned char) *f++; |
148 | ourchr = (unsigned char) *t; |
149 | if (lower > upper) { |
150 | int t = lower; lower = upper; upper = t; |
151 | } |
152 | if (ourchr >= lower && ourchr <= upper) |
153 | matched = 1; |
154 | } else { |
155 | matched |= (*t == *f++); |
156 | } |
157 | } |
158 | if (invert == matched) |
159 | return 0; /* failed to match character class */ |
160 | f++; /* eat the ] */ |
161 | } else { |
162 | /* |
163 | * Non-special character matches itself. |
164 | */ |
165 | if (*f != *t) |
166 | return 0; |
167 | f++; |
168 | } |
169 | /* |
170 | * Now we've done that, increment t past the character we |
171 | * matched. |
172 | */ |
173 | t++; |
174 | } |
175 | if (!*f || *f == '*') { |
176 | /* |
177 | * We have reached the end of f without finding a mismatch; |
178 | * so we're done. Update the caller pointers and return 1. |
179 | */ |
180 | *fragment = f; |
181 | *target = t; |
182 | return 1; |
183 | } |
184 | /* |
185 | * Otherwise, we must have reached the end of t before we |
186 | * reached the end of f; so we've failed. Return 0. |
187 | */ |
188 | return 0; |
189 | } |
190 | |
191 | /* |
192 | * This is the real wildcard matching routine. It returns 1 for a |
193 | * successful match, 0 for an unsuccessful match, and <0 for a |
194 | * syntax error in the wildcard. |
195 | */ |
196 | int wc_match(const char *wildcard, const char *target) |
197 | { |
198 | int ret; |
199 | |
200 | /* |
201 | * Every time we see a '*' _followed_ by a fragment, we just |
202 | * search along the string for a location at which the fragment |
203 | * matches. The only special case is when we see a fragment |
204 | * right at the start, in which case we just call the matching |
205 | * routine once and give up if it fails. |
206 | */ |
207 | if (*wildcard != '*') { |
208 | ret = wc_match_fragment(&wildcard, &target); |
209 | if (ret <= 0) |
210 | return ret; /* pass back failure or error alike */ |
211 | } |
212 | |
213 | while (*wildcard) { |
214 | assert(*wildcard == '*'); |
215 | while (*wildcard == '*') |
216 | wildcard++; |
217 | |
218 | /* |
219 | * It's possible we've just hit the end of the wildcard |
220 | * after seeing a *, in which case there's no need to |
221 | * bother searching any more because we've won. |
222 | */ |
223 | if (!*wildcard) |
224 | return 1; |
225 | |
226 | /* |
227 | * Now `wildcard' points at the next fragment. So we |
228 | * attempt to match it against `target', and if that fails |
229 | * we increment `target' and try again, and so on. When we |
230 | * find we're about to try matching against the empty |
231 | * string, we give up and return 0. |
232 | */ |
233 | ret = 0; |
234 | while (*target) { |
235 | const char *save_w = wildcard, *save_t = target; |
236 | |
237 | ret = wc_match_fragment(&wildcard, &target); |
238 | |
239 | if (ret < 0) |
240 | return ret; /* syntax error */ |
241 | |
242 | if (ret > 0 && !*wildcard && *target) { |
243 | /* |
244 | * Final special case - literally. |
245 | * |
246 | * This situation arises when we are matching a |
247 | * _terminal_ fragment of the wildcard (that is, |
248 | * there is nothing after it, e.g. "*a"), and it |
249 | * has matched _too early_. For example, matching |
250 | * "*a" against "parka" will match the "a" fragment |
251 | * against the _first_ a, and then (if it weren't |
252 | * for this special case) matching would fail |
253 | * because we're at the end of the wildcard but not |
254 | * at the end of the target string. |
255 | * |
256 | * In this case what we must do is measure the |
257 | * length of the fragment in the target (which is |
258 | * why we saved `target'), jump straight to that |
259 | * distance from the end of the string using |
260 | * strlen, and match the same fragment again there |
261 | * (which is why we saved `wildcard'). Then we |
262 | * return whatever that operation returns. |
263 | */ |
264 | target = save_t + strlen(save_t) - (target - save_t); |
265 | wildcard = save_w; |
266 | return wc_match_fragment(&wildcard, &target); |
267 | } |
268 | |
269 | if (ret > 0) |
270 | break; |
271 | target++; |
272 | } |
273 | if (ret > 0) |
274 | continue; |
275 | return 0; |
276 | } |
277 | |
278 | /* |
279 | * If we reach here, it must be because we successfully matched |
280 | * a fragment and then found ourselves right at the end of the |
281 | * wildcard. Hence, we return 1 if and only if we are also |
282 | * right at the end of the target. |
283 | */ |
284 | return (*target ? 0 : 1); |
285 | } |
286 | |
287 | /* |
288 | * Another utility routine that translates a non-wildcard string |
289 | * into its raw equivalent by removing any escaping backslashes. |
290 | * Expects a target string buffer of anything up to the length of |
291 | * the original wildcard. You can also pass NULL as the output |
292 | * buffer if you're only interested in the return value. |
293 | * |
294 | * Returns 1 on success, or 0 if a wildcard character was |
295 | * encountered. In the latter case the output string MAY not be |
296 | * zero-terminated and you should not use it for anything! |
297 | */ |
298 | int wc_unescape(char *output, const char *wildcard) |
299 | { |
300 | while (*wildcard) { |
301 | if (*wildcard == '\\') { |
302 | wildcard++; |
303 | /* We are lenient about trailing backslashes in non-wildcards. */ |
304 | if (*wildcard) { |
305 | if (output) |
306 | *output++ = *wildcard; |
307 | wildcard++; |
308 | } |
309 | } else if (*wildcard == '*' || *wildcard == '?' || |
310 | *wildcard == '[' || *wildcard == ']') { |
311 | return 0; /* it's a wildcard! */ |
312 | } else { |
313 | if (output) |
314 | *output++ = *wildcard; |
315 | wildcard++; |
316 | } |
317 | } |
318 | *output = '\0'; |
319 | return 1; /* it's clean */ |
320 | } |
321 | |
322 | #ifdef TESTMODE |
323 | |
324 | struct test { |
325 | const char *wildcard; |
326 | const char *target; |
327 | int expected_result; |
328 | }; |
329 | |
330 | const struct test fragment_tests[] = { |
331 | /* |
332 | * We exhaustively unit-test the fragment matching routine |
333 | * itself, which should save us the need to test all its |
334 | * intricacies during the full wildcard tests. |
335 | */ |
336 | {"abc", "abc", 1}, |
337 | {"abc", "abd", 0}, |
338 | {"abc", "abcd", 1}, |
339 | {"abcd", "abc", 0}, |
340 | {"ab[cd]", "abc", 1}, |
341 | {"ab[cd]", "abd", 1}, |
342 | {"ab[cd]", "abe", 0}, |
343 | {"ab[^cd]", "abc", 0}, |
344 | {"ab[^cd]", "abd", 0}, |
345 | {"ab[^cd]", "abe", 1}, |
346 | {"ab\\", "abc", -WC_TRAILINGBACKSLASH}, |
347 | {"ab\\*", "ab*", 1}, |
348 | {"ab\\?", "ab*", 0}, |
349 | {"ab?", "abc", 1}, |
350 | {"ab?", "ab", 0}, |
351 | {"ab[", "abc", -WC_UNCLOSEDCLASS}, |
352 | {"ab[c-", "abb", -WC_UNCLOSEDCLASS}, |
353 | {"ab[c-]", "abb", -WC_INVALIDRANGE}, |
354 | {"ab[c-e]", "abb", 0}, |
355 | {"ab[c-e]", "abc", 1}, |
356 | {"ab[c-e]", "abd", 1}, |
357 | {"ab[c-e]", "abe", 1}, |
358 | {"ab[c-e]", "abf", 0}, |
359 | {"ab[e-c]", "abb", 0}, |
360 | {"ab[e-c]", "abc", 1}, |
361 | {"ab[e-c]", "abd", 1}, |
362 | {"ab[e-c]", "abe", 1}, |
363 | {"ab[e-c]", "abf", 0}, |
364 | {"ab[^c-e]", "abb", 1}, |
365 | {"ab[^c-e]", "abc", 0}, |
366 | {"ab[^c-e]", "abd", 0}, |
367 | {"ab[^c-e]", "abe", 0}, |
368 | {"ab[^c-e]", "abf", 1}, |
369 | {"ab[^e-c]", "abb", 1}, |
370 | {"ab[^e-c]", "abc", 0}, |
371 | {"ab[^e-c]", "abd", 0}, |
372 | {"ab[^e-c]", "abe", 0}, |
373 | {"ab[^e-c]", "abf", 1}, |
374 | {"ab[a^]", "aba", 1}, |
375 | {"ab[a^]", "ab^", 1}, |
376 | {"ab[a^]", "abb", 0}, |
377 | {"ab[^a^]", "aba", 0}, |
378 | {"ab[^a^]", "ab^", 0}, |
379 | {"ab[^a^]", "abb", 1}, |
380 | {"ab[-c]", "ab-", 1}, |
381 | {"ab[-c]", "abc", 1}, |
382 | {"ab[-c]", "abd", 0}, |
383 | {"ab[^-c]", "ab-", 0}, |
384 | {"ab[^-c]", "abc", 0}, |
385 | {"ab[^-c]", "abd", 1}, |
386 | {"ab[\\[-\\]]", "abZ", 0}, |
387 | {"ab[\\[-\\]]", "ab[", 1}, |
388 | {"ab[\\[-\\]]", "ab\\", 1}, |
389 | {"ab[\\[-\\]]", "ab]", 1}, |
390 | {"ab[\\[-\\]]", "ab^", 0}, |
391 | {"ab[^\\[-\\]]", "abZ", 1}, |
392 | {"ab[^\\[-\\]]", "ab[", 0}, |
393 | {"ab[^\\[-\\]]", "ab\\", 0}, |
394 | {"ab[^\\[-\\]]", "ab]", 0}, |
395 | {"ab[^\\[-\\]]", "ab^", 1}, |
396 | {"ab[a-fA-F]", "aba", 1}, |
397 | {"ab[a-fA-F]", "abF", 1}, |
398 | {"ab[a-fA-F]", "abZ", 0}, |
399 | }; |
400 | |
401 | const struct test full_tests[] = { |
402 | {"a", "argh", 0}, |
403 | {"a", "ba", 0}, |
404 | {"a", "a", 1}, |
405 | {"a*", "aardvark", 1}, |
406 | {"a*", "badger", 0}, |
407 | {"*a", "park", 0}, |
408 | {"*a", "pArka", 1}, |
409 | {"*a", "parka", 1}, |
410 | {"*a*", "park", 1}, |
411 | {"*a*", "perk", 0}, |
412 | {"?b*r?", "abracadabra", 1}, |
413 | {"?b*r?", "abracadabr", 0}, |
414 | {"?b*r?", "abracadabzr", 0}, |
415 | }; |
416 | |
417 | int main(void) |
418 | { |
419 | int i; |
420 | int fails, passes; |
421 | |
422 | fails = passes = 0; |
423 | |
424 | for (i = 0; i < sizeof(fragment_tests)/sizeof(*fragment_tests); i++) { |
425 | const char *f, *t; |
426 | int eret, aret; |
427 | f = fragment_tests[i].wildcard; |
428 | t = fragment_tests[i].target; |
429 | eret = fragment_tests[i].expected_result; |
430 | aret = wc_match_fragment(&f, &t); |
431 | if (aret != eret) { |
432 | printf("failed test: /%s/ against /%s/ returned %d not %d\n", |
433 | fragment_tests[i].wildcard, fragment_tests[i].target, |
434 | aret, eret); |
435 | fails++; |
436 | } else |
437 | passes++; |
438 | } |
439 | |
440 | for (i = 0; i < sizeof(full_tests)/sizeof(*full_tests); i++) { |
441 | const char *f, *t; |
442 | int eret, aret; |
443 | f = full_tests[i].wildcard; |
444 | t = full_tests[i].target; |
445 | eret = full_tests[i].expected_result; |
446 | aret = wc_match(f, t); |
447 | if (aret != eret) { |
448 | printf("failed test: /%s/ against /%s/ returned %d not %d\n", |
449 | full_tests[i].wildcard, full_tests[i].target, |
450 | aret, eret); |
451 | fails++; |
452 | } else |
453 | passes++; |
454 | } |
455 | |
456 | printf("passed %d, failed %d\n", passes, fails); |
457 | |
458 | return 0; |
459 | } |
460 | |
461 | #endif |