zlib_disable_compression() and zlib_huflookup() are unused outside this file.
[u/mdw/putty] / wildcard.c
CommitLineData
4eb24e3a 1/*
2 * Wildcard matching engine for use with SFTP-based file transfer
3 * programs (PSFTP, new-look PSCP): since SFTP has no notion of
4 * getting the remote side to do globbing (and rightly so) we have
5 * to do it locally, by retrieving all the filenames in a directory
6 * and checking each against the wildcard pattern.
7 */
8
9#include <assert.h>
10#include <stdlib.h>
11#include <string.h>
12
13/*
14 * Definition of wildcard syntax:
15 *
16 * - * matches any sequence of characters, including zero.
17 * - ? matches exactly one character which can be anything.
18 * - [abc] matches exactly one character which is a, b or c.
19 * - [a-f] matches anything from a through f.
20 * - [^a-f] matches anything _except_ a through f.
21 * - [-_] matches - or _; [^-_] matches anything else. (The - is
22 * non-special if it occurs immediately after the opening
23 * bracket or ^.)
24 * - [a^] matches an a or a ^. (The ^ is non-special if it does
25 * _not_ occur immediately after the opening bracket.)
26 * - \*, \?, \[, \], \\ match the single characters *, ?, [, ], \.
27 * - All other characters are non-special and match themselves.
28 */
29
30/*
31 * The wildcard matching technique we use is very simple and
32 * potentially O(N^2) in running time, but I don't anticipate it
33 * being that bad in reality (particularly since N will be the size
34 * of a filename, which isn't all that much). Perhaps one day, once
35 * PuTTY has grown a regexp matcher for some other reason, I might
36 * come back and reimplement wildcards by translating them into
37 * regexps or directly into NFAs; but for the moment, in the
38 * absence of any other need for the NFA->DFA translation engine,
39 * anything more than the simplest possible wildcard matcher is
40 * vast code-size overkill.
41 *
42 * Essentially, these wildcards are much simpler than regexps in
43 * that they consist of a sequence of rigid fragments (? and [...]
44 * can never match more or less than one character) separated by
45 * asterisks. It is therefore extremely simple to look at a rigid
46 * fragment and determine whether or not it begins at a particular
47 * point in the test string; so we can search along the string
48 * until we find each fragment, then search for the next. As long
49 * as we find each fragment in the _first_ place it occurs, there
50 * will never be a danger of having to backpedal and try to find it
51 * again somewhere else.
52 */
53
54enum {
55 WC_TRAILINGBACKSLASH = 1,
56 WC_UNCLOSEDCLASS,
62cdf1ed 57 WC_INVALIDRANGE
4eb24e3a 58};
59
60/*
61 * Error reporting is done by returning various negative values
62 * from the wildcard routines. Passing any such value to wc_error
63 * will give a human-readable message.
64 */
65const char *wc_error(int value)
66{
67 value = abs(value);
68 switch (value) {
69 case WC_TRAILINGBACKSLASH:
70 return "'\' occurred at end of string (expected another character)";
71 case WC_UNCLOSEDCLASS:
72 return "expected ']' to close character class";
73 case WC_INVALIDRANGE:
74 return "character range was not terminated (']' just after '-')";
75 }
76 return "INTERNAL ERROR: unrecognised wildcard error number";
77}
78
79/*
80 * This is the routine that tests a target string to see if an
81 * initial substring of it matches a fragment. If successful, it
82 * returns 1, and advances both `fragment' and `target' past the
83 * fragment and matching substring respectively. If unsuccessful it
84 * returns zero. If the wildcard fragment suffers a syntax error,
85 * it returns <0 and the precise value indexes into wc_error.
86 */
87static int wc_match_fragment(const char **fragment, const char **target)
88{
89 const char *f, *t;
90
91 f = *fragment;
92 t = *target;
93 /*
94 * The fragment terminates at either the end of the string, or
95 * the first (unescaped) *.
96 */
97 while (*f && *f != '*' && *t) {
98 /*
99 * Extract one character from t, and one character's worth
100 * of pattern from f, and step along both. Return 0 if they
101 * fail to match.
102 */
103 if (*f == '\\') {
104 /*
105 * Backslash, which means f[1] is to be treated as a
106 * literal character no matter what it is. It may not
107 * be the end of the string.
108 */
109 if (!f[1])
110 return -WC_TRAILINGBACKSLASH; /* error */
111 if (f[1] != *t)
112 return 0; /* failed to match */
113 f += 2;
114 } else if (*f == '?') {
115 /*
116 * Question mark matches anything.
117 */
118 f++;
119 } else if (*f == '[') {
120 int invert = 0;
121 int matched = 0;
122 /*
123 * Open bracket introduces a character class.
124 */
125 f++;
126 if (*f == '^') {
127 invert = 1;
128 f++;
129 }
130 while (*f != ']') {
131 if (*f == '\\')
132 f++; /* backslashes still work */
133 if (!*f)
134 return -WC_UNCLOSEDCLASS; /* error again */
135 if (f[1] == '-') {
136 int lower, upper, ourchr;
137 lower = (unsigned char) *f++;
138 f++; /* eat the minus */
139 if (*f == ']')
140 return -WC_INVALIDRANGE; /* different error! */
141 if (*f == '\\')
142 f++; /* backslashes _still_ work */
143 if (!*f)
144 return -WC_UNCLOSEDCLASS; /* error again */
145 upper = (unsigned char) *f++;
146 ourchr = (unsigned char) *t;
147 if (lower > upper) {
148 int t = lower; lower = upper; upper = t;
149 }
150 if (ourchr >= lower && ourchr <= upper)
151 matched = 1;
152 } else {
153 matched |= (*t == *f++);
154 }
155 }
156 if (invert == matched)
157 return 0; /* failed to match character class */
158 f++; /* eat the ] */
159 } else {
160 /*
161 * Non-special character matches itself.
162 */
163 if (*f != *t)
164 return 0;
165 f++;
166 }
167 /*
168 * Now we've done that, increment t past the character we
169 * matched.
170 */
171 t++;
172 }
173 if (!*f || *f == '*') {
174 /*
175 * We have reached the end of f without finding a mismatch;
176 * so we're done. Update the caller pointers and return 1.
177 */
178 *fragment = f;
179 *target = t;
180 return 1;
181 }
182 /*
183 * Otherwise, we must have reached the end of t before we
184 * reached the end of f; so we've failed. Return 0.
185 */
186 return 0;
187}
188
189/*
190 * This is the real wildcard matching routine. It returns 1 for a
191 * successful match, 0 for an unsuccessful match, and <0 for a
192 * syntax error in the wildcard.
193 */
194int wc_match(const char *wildcard, const char *target)
195{
196 int ret;
197
198 /*
199 * Every time we see a '*' _followed_ by a fragment, we just
200 * search along the string for a location at which the fragment
201 * matches. The only special case is when we see a fragment
202 * right at the start, in which case we just call the matching
203 * routine once and give up if it fails.
204 */
205 if (*wildcard != '*') {
206 ret = wc_match_fragment(&wildcard, &target);
207 if (ret <= 0)
208 return ret; /* pass back failure or error alike */
209 }
210
211 while (*wildcard) {
212 assert(*wildcard == '*');
213 while (*wildcard == '*')
214 wildcard++;
215
216 /*
217 * It's possible we've just hit the end of the wildcard
218 * after seeing a *, in which case there's no need to
219 * bother searching any more because we've won.
220 */
221 if (!*wildcard)
222 return 1;
223
224 /*
225 * Now `wildcard' points at the next fragment. So we
226 * attempt to match it against `target', and if that fails
227 * we increment `target' and try again, and so on. When we
228 * find we're about to try matching against the empty
229 * string, we give up and return 0.
230 */
231 ret = 0;
232 while (*target) {
233 const char *save_w = wildcard, *save_t = target;
234
235 ret = wc_match_fragment(&wildcard, &target);
236
237 if (ret < 0)
238 return ret; /* syntax error */
239
240 if (ret > 0 && !*wildcard && *target) {
241 /*
242 * Final special case - literally.
243 *
244 * This situation arises when we are matching a
245 * _terminal_ fragment of the wildcard (that is,
246 * there is nothing after it, e.g. "*a"), and it
247 * has matched _too early_. For example, matching
248 * "*a" against "parka" will match the "a" fragment
249 * against the _first_ a, and then (if it weren't
250 * for this special case) matching would fail
251 * because we're at the end of the wildcard but not
252 * at the end of the target string.
253 *
254 * In this case what we must do is measure the
255 * length of the fragment in the target (which is
256 * why we saved `target'), jump straight to that
257 * distance from the end of the string using
258 * strlen, and match the same fragment again there
259 * (which is why we saved `wildcard'). Then we
260 * return whatever that operation returns.
261 */
262 target = save_t + strlen(save_t) - (target - save_t);
263 wildcard = save_w;
264 return wc_match_fragment(&wildcard, &target);
265 }
266
267 if (ret > 0)
268 break;
269 target++;
270 }
271 if (ret > 0)
272 continue;
273 return 0;
274 }
275
276 /*
277 * If we reach here, it must be because we successfully matched
278 * a fragment and then found ourselves right at the end of the
279 * wildcard. Hence, we return 1 if and only if we are also
280 * right at the end of the target.
281 */
282 return (*target ? 0 : 1);
283}
284
285/*
286 * Another utility routine that translates a non-wildcard string
287 * into its raw equivalent by removing any escaping backslashes.
288 * Expects a target string buffer of anything up to the length of
289 * the original wildcard. You can also pass NULL as the output
290 * buffer if you're only interested in the return value.
291 *
292 * Returns 1 on success, or 0 if a wildcard character was
293 * encountered. In the latter case the output string MAY not be
294 * zero-terminated and you should not use it for anything!
295 */
296int wc_unescape(char *output, const char *wildcard)
297{
298 while (*wildcard) {
299 if (*wildcard == '\\') {
300 wildcard++;
301 /* We are lenient about trailing backslashes in non-wildcards. */
302 if (*wildcard) {
303 if (output)
304 *output++ = *wildcard;
305 wildcard++;
306 }
307 } else if (*wildcard == '*' || *wildcard == '?' ||
308 *wildcard == '[' || *wildcard == ']') {
309 return 0; /* it's a wildcard! */
310 } else {
311 if (output)
312 *output++ = *wildcard;
313 wildcard++;
314 }
315 }
316 *output = '\0';
317 return 1; /* it's clean */
318}
319
320#ifdef TESTMODE
321
322struct test {
323 const char *wildcard;
324 const char *target;
325 int expected_result;
326};
327
328const struct test fragment_tests[] = {
329 /*
330 * We exhaustively unit-test the fragment matching routine
331 * itself, which should save us the need to test all its
332 * intricacies during the full wildcard tests.
333 */
334 {"abc", "abc", 1},
335 {"abc", "abd", 0},
336 {"abc", "abcd", 1},
337 {"abcd", "abc", 0},
338 {"ab[cd]", "abc", 1},
339 {"ab[cd]", "abd", 1},
340 {"ab[cd]", "abe", 0},
341 {"ab[^cd]", "abc", 0},
342 {"ab[^cd]", "abd", 0},
343 {"ab[^cd]", "abe", 1},
344 {"ab\\", "abc", -WC_TRAILINGBACKSLASH},
345 {"ab\\*", "ab*", 1},
346 {"ab\\?", "ab*", 0},
347 {"ab?", "abc", 1},
348 {"ab?", "ab", 0},
349 {"ab[", "abc", -WC_UNCLOSEDCLASS},
350 {"ab[c-", "abb", -WC_UNCLOSEDCLASS},
351 {"ab[c-]", "abb", -WC_INVALIDRANGE},
352 {"ab[c-e]", "abb", 0},
353 {"ab[c-e]", "abc", 1},
354 {"ab[c-e]", "abd", 1},
355 {"ab[c-e]", "abe", 1},
356 {"ab[c-e]", "abf", 0},
357 {"ab[e-c]", "abb", 0},
358 {"ab[e-c]", "abc", 1},
359 {"ab[e-c]", "abd", 1},
360 {"ab[e-c]", "abe", 1},
361 {"ab[e-c]", "abf", 0},
362 {"ab[^c-e]", "abb", 1},
363 {"ab[^c-e]", "abc", 0},
364 {"ab[^c-e]", "abd", 0},
365 {"ab[^c-e]", "abe", 0},
366 {"ab[^c-e]", "abf", 1},
367 {"ab[^e-c]", "abb", 1},
368 {"ab[^e-c]", "abc", 0},
369 {"ab[^e-c]", "abd", 0},
370 {"ab[^e-c]", "abe", 0},
371 {"ab[^e-c]", "abf", 1},
372 {"ab[a^]", "aba", 1},
373 {"ab[a^]", "ab^", 1},
374 {"ab[a^]", "abb", 0},
375 {"ab[^a^]", "aba", 0},
376 {"ab[^a^]", "ab^", 0},
377 {"ab[^a^]", "abb", 1},
378 {"ab[-c]", "ab-", 1},
379 {"ab[-c]", "abc", 1},
380 {"ab[-c]", "abd", 0},
381 {"ab[^-c]", "ab-", 0},
382 {"ab[^-c]", "abc", 0},
383 {"ab[^-c]", "abd", 1},
384 {"ab[\\[-\\]]", "abZ", 0},
385 {"ab[\\[-\\]]", "ab[", 1},
386 {"ab[\\[-\\]]", "ab\\", 1},
387 {"ab[\\[-\\]]", "ab]", 1},
388 {"ab[\\[-\\]]", "ab^", 0},
389 {"ab[^\\[-\\]]", "abZ", 1},
390 {"ab[^\\[-\\]]", "ab[", 0},
391 {"ab[^\\[-\\]]", "ab\\", 0},
392 {"ab[^\\[-\\]]", "ab]", 0},
393 {"ab[^\\[-\\]]", "ab^", 1},
394 {"ab[a-fA-F]", "aba", 1},
395 {"ab[a-fA-F]", "abF", 1},
396 {"ab[a-fA-F]", "abZ", 0},
397};
398
399const struct test full_tests[] = {
400 {"a", "argh", 0},
401 {"a", "ba", 0},
402 {"a", "a", 1},
403 {"a*", "aardvark", 1},
404 {"a*", "badger", 0},
405 {"*a", "park", 0},
406 {"*a", "pArka", 1},
407 {"*a", "parka", 1},
408 {"*a*", "park", 1},
409 {"*a*", "perk", 0},
410 {"?b*r?", "abracadabra", 1},
411 {"?b*r?", "abracadabr", 0},
412 {"?b*r?", "abracadabzr", 0},
413};
414
415int main(void)
416{
417 int i;
418 int fails, passes;
419
420 fails = passes = 0;
421
422 for (i = 0; i < sizeof(fragment_tests)/sizeof(*fragment_tests); i++) {
423 const char *f, *t;
424 int eret, aret;
425 f = fragment_tests[i].wildcard;
426 t = fragment_tests[i].target;
427 eret = fragment_tests[i].expected_result;
428 aret = wc_match_fragment(&f, &t);
429 if (aret != eret) {
430 printf("failed test: /%s/ against /%s/ returned %d not %d\n",
431 fragment_tests[i].wildcard, fragment_tests[i].target,
432 aret, eret);
433 fails++;
434 } else
435 passes++;
436 }
437
438 for (i = 0; i < sizeof(full_tests)/sizeof(*full_tests); i++) {
439 const char *f, *t;
440 int eret, aret;
441 f = full_tests[i].wildcard;
442 t = full_tests[i].target;
443 eret = full_tests[i].expected_result;
444 aret = wc_match(f, t);
445 if (aret != eret) {
446 printf("failed test: /%s/ against /%s/ returned %d not %d\n",
447 full_tests[i].wildcard, full_tests[i].target,
448 aret, eret);
449 fails++;
450 } else
451 passes++;
452 }
453
454 printf("passed %d, failed %d\n", passes, fails);
455
456 return 0;
457}
458
459#endif