git.distorted.org.uk Git - u/mdw/putty/blame_incremental

... / ...

Commit	Line	Data
	1	/*
	2	* Wildcard matching engine for use with SFTP-based file transfer
	3	* programs (PSFTP, new-look PSCP): since SFTP has no notion of
	4	* getting the remote side to do globbing (and rightly so) we have
	5	* to do it locally, by retrieving all the filenames in a directory
	6	* and checking each against the wildcard pattern.
	7	*/
	8
	9	#include <assert.h>
	10	#include <stdlib.h>
	11	#include <string.h>
	12
	13	/*
	14	* Definition of wildcard syntax:
	15	*
	16	* - * matches any sequence of characters, including zero.
	17	* - ? matches exactly one character which can be anything.
	18	* - [abc] matches exactly one character which is a, b or c.
	19	* - [a-f] matches anything from a through f.
	20	* - [^a-f] matches anything _except_ a through f.
	21	* - [-_] matches - or _; [^-_] matches anything else. (The - is
	22	* non-special if it occurs immediately after the opening
	23	* bracket or ^.)
	24	* - [a^] matches an a or a ^. (The ^ is non-special if it does
	25	* _not_ occur immediately after the opening bracket.)
	26	* - \, \?, \[, \], \\ match the single characters , ?, [, ], \.
	27	* - All other characters are non-special and match themselves.
	28	*/
	29
	30	/*
	31	* The wildcard matching technique we use is very simple and
	32	* potentially O(N^2) in running time, but I don't anticipate it
	33	* being that bad in reality (particularly since N will be the size
	34	* of a filename, which isn't all that much). Perhaps one day, once
	35	* PuTTY has grown a regexp matcher for some other reason, I might
	36	* come back and reimplement wildcards by translating them into
	37	* regexps or directly into NFAs; but for the moment, in the
	38	* absence of any other need for the NFA->DFA translation engine,
	39	* anything more than the simplest possible wildcard matcher is
	40	* vast code-size overkill.
	41	*
	42	* Essentially, these wildcards are much simpler than regexps in
	43	* that they consist of a sequence of rigid fragments (? and [...]
	44	* can never match more or less than one character) separated by
	45	* asterisks. It is therefore extremely simple to look at a rigid
	46	* fragment and determine whether or not it begins at a particular
	47	* point in the test string; so we can search along the string
	48	* until we find each fragment, then search for the next. As long
	49	* as we find each fragment in the _first_ place it occurs, there
	50	* will never be a danger of having to backpedal and try to find it
	51	* again somewhere else.
	52	*/
	53
	54	enum {
	55	WC_TRAILINGBACKSLASH = 1,
	56	WC_UNCLOSEDCLASS,
	57	WC_INVALIDRANGE,
	58	};
	59
	60	/*
	61	* Error reporting is done by returning various negative values
	62	* from the wildcard routines. Passing any such value to wc_error
	63	* will give a human-readable message.
	64	*/
	65	const char *wc_error(int value)
	66	{
	67	value = abs(value);
	68	switch (value) {
	69	case WC_TRAILINGBACKSLASH:
	70	return "'\' occurred at end of string (expected another character)";
	71	case WC_UNCLOSEDCLASS:
	72	return "expected ']' to close character class";
	73	case WC_INVALIDRANGE:
	74	return "character range was not terminated (']' just after '-')";
	75	}
	76	return "INTERNAL ERROR: unrecognised wildcard error number";
	77	}
	78
	79	/*
	80	* This is the routine that tests a target string to see if an
	81	* initial substring of it matches a fragment. If successful, it
	82	* returns 1, and advances both `fragment' and `target' past the
	83	* fragment and matching substring respectively. If unsuccessful it
	84	* returns zero. If the wildcard fragment suffers a syntax error,
	85	* it returns <0 and the precise value indexes into wc_error.
	86	*/
	87	static int wc_match_fragment(const char fragment, const char target)
	88	{
	89	const char f, t;
	90
	91	f = *fragment;
	92	t = *target;
	93	/*
	94	* The fragment terminates at either the end of the string, or
	95	* the first (unescaped) *.
	96	*/
	97	while (f && f != '' && t) {
	98	/*
	99	* Extract one character from t, and one character's worth
	100	* of pattern from f, and step along both. Return 0 if they
	101	* fail to match.
	102	*/
	103	if (*f == '\\') {
	104	/*
	105	* Backslash, which means f[1] is to be treated as a
	106	* literal character no matter what it is. It may not
	107	* be the end of the string.
	108	*/
	109	if (!f[1])
	110	return -WC_TRAILINGBACKSLASH; /* error */
	111	if (f[1] != *t)
	112	return 0; /* failed to match */
	113	f += 2;
	114	} else if (*f == '?') {
	115	/*
	116	* Question mark matches anything.
	117	*/
	118	f++;
	119	} else if (*f == '[') {
	120	int invert = 0;
	121	int matched = 0;
	122	/*
	123	* Open bracket introduces a character class.
	124	*/
	125	f++;
	126	if (*f == '^') {
	127	invert = 1;
	128	f++;
	129	}
	130	while (*f != ']') {
	131	if (*f == '\\')
	132	f++; /* backslashes still work */
	133	if (!*f)
	134	return -WC_UNCLOSEDCLASS; /* error again */
	135	if (f[1] == '-') {
	136	int lower, upper, ourchr;
	137	lower = (unsigned char) *f++;
	138	f++; /* eat the minus */
	139	if (*f == ']')
	140	return -WC_INVALIDRANGE; /* different error! */
	141	if (*f == '\\')
	142	f++; /* backslashes _still_ work */
	143	if (!*f)
	144	return -WC_UNCLOSEDCLASS; /* error again */
	145	upper = (unsigned char) *f++;
	146	ourchr = (unsigned char) *t;
	147	if (lower > upper) {
	148	int t = lower; lower = upper; upper = t;
	149	}
	150	if (ourchr >= lower && ourchr <= upper)
	151	matched = 1;
	152	} else {
	153	matched \|= (t == f++);
	154	}
	155	}
	156	if (invert == matched)
	157	return 0; /* failed to match character class */
	158	f++; /* eat the ] */
	159	} else {
	160	/*
	161	* Non-special character matches itself.
	162	*/
	163	if (f != t)
	164	return 0;
	165	f++;
	166	}
	167	/*
	168	* Now we've done that, increment t past the character we
	169	* matched.
	170	*/
	171	t++;
	172	}
	173	if (!f \|\| f == '*') {
	174	/*
	175	* We have reached the end of f without finding a mismatch;
	176	* so we're done. Update the caller pointers and return 1.
	177	*/
	178	*fragment = f;
	179	*target = t;
	180	return 1;
	181	}
	182	/*
	183	* Otherwise, we must have reached the end of t before we
	184	* reached the end of f; so we've failed. Return 0.
	185	*/
	186	return 0;
	187	}
	188
	189	/*
	190	* This is the real wildcard matching routine. It returns 1 for a
	191	* successful match, 0 for an unsuccessful match, and <0 for a
	192	* syntax error in the wildcard.
	193	*/
	194	int wc_match(const char wildcard, const char target)
	195	{
	196	int ret;
	197
	198	/*
	199	* Every time we see a '*' _followed_ by a fragment, we just
	200	* search along the string for a location at which the fragment
	201	* matches. The only special case is when we see a fragment
	202	* right at the start, in which case we just call the matching
	203	* routine once and give up if it fails.
	204	*/
	205	if (wildcard != '') {
	206	ret = wc_match_fragment(&wildcard, &target);
	207	if (ret <= 0)
	208	return ret; /* pass back failure or error alike */
	209	}
	210
	211	while (*wildcard) {
	212	assert(wildcard == '');
	213	while (wildcard == '')
	214	wildcard++;
	215
	216	/*
	217	* It's possible we've just hit the end of the wildcard
	218	* after seeing a *, in which case there's no need to
	219	* bother searching any more because we've won.
	220	*/
	221	if (!*wildcard)
	222	return 1;
	223
	224	/*
	225	* Now `wildcard' points at the next fragment. So we
	226	* attempt to match it against `target', and if that fails
	227	* we increment `target' and try again, and so on. When we
	228	* find we're about to try matching against the empty
	229	* string, we give up and return 0.
	230	*/
	231	ret = 0;
	232	while (*target) {
	233	const char save_w = wildcard, save_t = target;
	234
	235	ret = wc_match_fragment(&wildcard, &target);
	236
	237	if (ret < 0)
	238	return ret; /* syntax error */
	239
	240	if (ret > 0 && !wildcard && target) {
	241	/*
	242	* Final special case - literally.
	243	*
	244	* This situation arises when we are matching a
	245	* _terminal_ fragment of the wildcard (that is,
	246	* there is nothing after it, e.g. "*a"), and it
	247	* has matched _too early_. For example, matching
	248	* "*a" against "parka" will match the "a" fragment
	249	* against the _first_ a, and then (if it weren't
	250	* for this special case) matching would fail
	251	* because we're at the end of the wildcard but not
	252	* at the end of the target string.
	253	*
	254	* In this case what we must do is measure the
	255	* length of the fragment in the target (which is
	256	* why we saved `target'), jump straight to that
	257	* distance from the end of the string using
	258	* strlen, and match the same fragment again there
	259	* (which is why we saved `wildcard'). Then we
	260	* return whatever that operation returns.
	261	*/
	262	target = save_t + strlen(save_t) - (target - save_t);
	263	wildcard = save_w;
	264	return wc_match_fragment(&wildcard, &target);
	265	}
	266
	267	if (ret > 0)
	268	break;
	269	target++;
	270	}
	271	if (ret > 0)
	272	continue;
	273	return 0;
	274	}
	275
	276	/*
	277	* If we reach here, it must be because we successfully matched
	278	* a fragment and then found ourselves right at the end of the
	279	* wildcard. Hence, we return 1 if and only if we are also
	280	* right at the end of the target.
	281	*/
	282	return (*target ? 0 : 1);
	283	}
	284
	285	/*
	286	* Another utility routine that translates a non-wildcard string
	287	* into its raw equivalent by removing any escaping backslashes.
	288	* Expects a target string buffer of anything up to the length of
	289	* the original wildcard. You can also pass NULL as the output
	290	* buffer if you're only interested in the return value.
	291	*
	292	* Returns 1 on success, or 0 if a wildcard character was
	293	* encountered. In the latter case the output string MAY not be
	294	* zero-terminated and you should not use it for anything!
	295	*/
	296	int wc_unescape(char output, const char wildcard)
	297	{
	298	while (*wildcard) {
	299	if (*wildcard == '\\') {
	300	wildcard++;
	301	/* We are lenient about trailing backslashes in non-wildcards. */
	302	if (*wildcard) {
	303	if (output)
	304	output++ = wildcard;
	305	wildcard++;
	306	}
	307	} else if (wildcard == '' \|\| *wildcard == '?' \|\|
	308	wildcard == '[' \|\| wildcard == ']') {
	309	return 0; /* it's a wildcard! */
	310	} else {
	311	if (output)
	312	output++ = wildcard;
	313	wildcard++;
	314	}
	315	}
	316	*output = '\0';
	317	return 1; /* it's clean */
	318	}
	319
	320	#ifdef TESTMODE
	321
	322	struct test {
	323	const char *wildcard;
	324	const char *target;
	325	int expected_result;
	326	};
	327
	328	const struct test fragment_tests[] = {
	329	/*
	330	* We exhaustively unit-test the fragment matching routine
	331	* itself, which should save us the need to test all its
	332	* intricacies during the full wildcard tests.
	333	*/
	334	{"abc", "abc", 1},
	335	{"abc", "abd", 0},
	336	{"abc", "abcd", 1},
	337	{"abcd", "abc", 0},
	338	{"ab[cd]", "abc", 1},
	339	{"ab[cd]", "abd", 1},
	340	{"ab[cd]", "abe", 0},
	341	{"ab[^cd]", "abc", 0},
	342	{"ab[^cd]", "abd", 0},
	343	{"ab[^cd]", "abe", 1},
	344	{"ab\\", "abc", -WC_TRAILINGBACKSLASH},
	345	{"ab\\", "ab", 1},
	346	{"ab\\?", "ab*", 0},
	347	{"ab?", "abc", 1},
	348	{"ab?", "ab", 0},
	349	{"ab[", "abc", -WC_UNCLOSEDCLASS},
	350	{"ab[c-", "abb", -WC_UNCLOSEDCLASS},
	351	{"ab[c-]", "abb", -WC_INVALIDRANGE},
	352	{"ab[c-e]", "abb", 0},
	353	{"ab[c-e]", "abc", 1},
	354	{"ab[c-e]", "abd", 1},
	355	{"ab[c-e]", "abe", 1},
	356	{"ab[c-e]", "abf", 0},
	357	{"ab[e-c]", "abb", 0},
	358	{"ab[e-c]", "abc", 1},
	359	{"ab[e-c]", "abd", 1},
	360	{"ab[e-c]", "abe", 1},
	361	{"ab[e-c]", "abf", 0},
	362	{"ab[^c-e]", "abb", 1},
	363	{"ab[^c-e]", "abc", 0},
	364	{"ab[^c-e]", "abd", 0},
	365	{"ab[^c-e]", "abe", 0},
	366	{"ab[^c-e]", "abf", 1},
	367	{"ab[^e-c]", "abb", 1},
	368	{"ab[^e-c]", "abc", 0},
	369	{"ab[^e-c]", "abd", 0},
	370	{"ab[^e-c]", "abe", 0},
	371	{"ab[^e-c]", "abf", 1},
	372	{"ab[a^]", "aba", 1},
	373	{"ab[a^]", "ab^", 1},
	374	{"ab[a^]", "abb", 0},
	375	{"ab[^a^]", "aba", 0},
	376	{"ab[^a^]", "ab^", 0},
	377	{"ab[^a^]", "abb", 1},
	378	{"ab[-c]", "ab-", 1},
	379	{"ab[-c]", "abc", 1},
	380	{"ab[-c]", "abd", 0},
	381	{"ab[^-c]", "ab-", 0},
	382	{"ab[^-c]", "abc", 0},
	383	{"ab[^-c]", "abd", 1},
	384	{"ab[\\[-\\]]", "abZ", 0},
	385	{"ab[\\[-\\]]", "ab[", 1},
	386	{"ab[\\[-\\]]", "ab\\", 1},
	387	{"ab[\\[-\\]]", "ab]", 1},
	388	{"ab[\\[-\\]]", "ab^", 0},
	389	{"ab[^\\[-\\]]", "abZ", 1},
	390	{"ab[^\\[-\\]]", "ab[", 0},
	391	{"ab[^\\[-\\]]", "ab\\", 0},
	392	{"ab[^\\[-\\]]", "ab]", 0},
	393	{"ab[^\\[-\\]]", "ab^", 1},
	394	{"ab[a-fA-F]", "aba", 1},
	395	{"ab[a-fA-F]", "abF", 1},
	396	{"ab[a-fA-F]", "abZ", 0},
	397	};
	398
	399	const struct test full_tests[] = {
	400	{"a", "argh", 0},
	401	{"a", "ba", 0},
	402	{"a", "a", 1},
	403	{"a*", "aardvark", 1},
	404	{"a*", "badger", 0},
	405	{"*a", "park", 0},
	406	{"*a", "pArka", 1},
	407	{"*a", "parka", 1},
	408	{"a", "park", 1},
	409	{"a", "perk", 0},
	410	{"?b*r?", "abracadabra", 1},
	411	{"?b*r?", "abracadabr", 0},
	412	{"?b*r?", "abracadabzr", 0},
	413	};
	414
	415	int main(void)
	416	{
	417	int i;
	418	int fails, passes;
	419
	420	fails = passes = 0;
	421
	422	for (i = 0; i < sizeof(fragment_tests)/sizeof(*fragment_tests); i++) {
	423	const char f, t;
	424	int eret, aret;
	425	f = fragment_tests[i].wildcard;
	426	t = fragment_tests[i].target;
	427	eret = fragment_tests[i].expected_result;
	428	aret = wc_match_fragment(&f, &t);
	429	if (aret != eret) {
	430	printf("failed test: /%s/ against /%s/ returned %d not %d\n",
	431	fragment_tests[i].wildcard, fragment_tests[i].target,
	432	aret, eret);
	433	fails++;
	434	} else
	435	passes++;
	436	}
	437
	438	for (i = 0; i < sizeof(full_tests)/sizeof(*full_tests); i++) {
	439	const char f, t;
	440	int eret, aret;
	441	f = full_tests[i].wildcard;
	442	t = full_tests[i].target;
	443	eret = full_tests[i].expected_result;
	444	aret = wc_match(f, t);
	445	if (aret != eret) {
	446	printf("failed test: /%s/ against /%s/ returned %d not %d\n",
	447	full_tests[i].wildcard, full_tests[i].target,
	448	aret, eret);
	449	fails++;
	450	} else
	451	passes++;
	452	}
	453
	454	printf("passed %d, failed %d\n", passes, fails);
	455
	456	return 0;
	457	}
	458
	459	#endif