From 1d3a7ff684c36162124901504e62a4c6d48ead2f Mon Sep 17 00:00:00 2001 From: simon Date: Wed, 13 Apr 2011 18:23:05 +0000 Subject: [PATCH] Escape dots at the start of pathname components. (Mostly for HTML multifile output mode, but I think it's at worst harmless to leave it enabled in web server contexts too.) Also I've just realised that if you type in a URL and guess wrongly which characters agedu will have quoted, the parse function will work it out anyway and give you a redirect to the canonical version of the path. This was more or less accidental - an unanticipated consequence of my parse-reformat-redirect strategy - but it seems thoroughly useful. Add a comment pretending I meant to do it all along :-) git-svn-id: svn://svn.tartarus.org/sgt/agedu@9153 cda61777-01e9-0310-a592-d414129be87e --- html.c | 36 +++++++++++++++++++++++++++++++++--- html.h | 4 +++- 2 files changed, 36 insertions(+), 4 deletions(-) diff --git a/html.c b/html.c index 6674b47..f1b4df0 100644 --- a/html.c +++ b/html.c @@ -446,6 +446,7 @@ char *format_string(const char *fmt, unsigned long index, const void *t) char *path = NULL, *q = NULL; char pathsep = trie_pathsep(t); int maxpathlen = trie_maxpathlen(t); + int leading; while (fmt) { struct format_option opt = get_format_option(&fmt); @@ -483,14 +484,20 @@ char *format_string(const char *fmt, unsigned long index, const void *t) trie_getpath(t, index, path); q = path; } + leading = 1; while (*q) { char c = *q++; - if (c == pathsep && opt.translate_pathsep) + if (c == pathsep && opt.translate_pathsep) { *p++ = '/'; - else if (!isalnum((unsigned char)c) && !strchr("-.@_", c)) + leading = 1; + } else if (!isalnum((unsigned char)c) && + ((leading && c=='.') || !strchr("-.@_", c))) { p += sprintf(p, "=%02X", (unsigned char)c); - else + leading = 0; + } else { *p++ = c; + leading = 0; + } } sfree(path); break; @@ -608,6 +615,29 @@ int html_parse_path(const void *t, const char *path, p++; midlen--; } else if (*p == '=') { + /* + * We 
intentionally do not check whether the + * escaped character _should_ have been escaped + * according to the rules in html_format_path. + * + * All clients of this parsing function, after a + * successful parse, call html_format_path to find + * the canonical URI for the same index and return + * an HTTP redirect if the provided URI was not + * exactly equal to that canonical form. This is + * critical when the correction involves adding or + * removing a trailing slash (because then + * relative hrefs on the generated page can be + * computed with respect to the canonical URI + * instead of having to remember what the actual + * URI was), but also has the useful effect that + * if a user attempts to type in (guess) a URI by + * hand they don't have to remember the escaping + * rules - as long as they type _something_ that + * this code can parse into a recognisable + * pathname, it will be automatically 301ed into + * the canonical form. + */ if (midlen < 3 || !isxdigit((unsigned char)p[1]) || !isxdigit((unsigned char)p[2])) diff --git a/html.h b/html.h index d420036..6615a4a 100644 --- a/html.h +++ b/html.h @@ -29,7 +29,9 @@ struct html_config { * two hex digits (including, in particular, = itself). The * only characters not escaped are the ASCII alphabets and * numbers, the subdirectory separator as mentioned above, - * and the four punctuation characters -.@_ . + * and the four punctuation characters -.@_ (with the + * exception that at the start of each pathname component, + * even '.' is escaped). * - '%/p' outputs the pathname of the tree entry, but this time * the subdirectory separator is also considered to be a * worrying character and is escaped. -- 2.11.0