Switch all the HTML-based reporting modes (the internal httpd, the CGI
author simon <simon@cda61777-01e9-0310-a592-d414129be87e>
Sat, 9 Apr 2011 21:01:01 +0000 (21:01 +0000)
committer simon <simon@cda61777-01e9-0310-a592-d414129be87e>
Sat, 9 Apr 2011 21:01:01 +0000 (21:01 +0000)
mode and the dump of static HTML files) to using URIs and filenames
based on the text of the pathname being reported on, rather than its
numeric index in the data file. The aim is that sub-URIs should remain
valid when the data is updated - if, for instance, you're running the
agedu CGI script permanently and changing the data file under it every
so often.

There's code in there to support the old numeric index behaviour, but
currently no UI to enable it. One could be added easily enough, though.

git-svn-id: svn://svn.tartarus.org/sgt/agedu@9150 cda61777-01e9-0310-a592-d414129be87e

TODO
agedu.c
html.c
html.h
httpd.c

diff --git a/TODO b/TODO
index df84d4a..4220a4d 100644 (file)
--- a/TODO
+++ b/TODO
@@ -8,12 +8,6 @@ TODO list for agedu
    enable other modes of use like the built-in --cgi mode, without
    me having to anticipate them in detail.)
 
- - add the option (and perhaps even make it default) to address HTML
-   subpages by pathname rather than by index number. More than just
-   cosmetic: it means that in a scenario where agedu --cgi is always
-   running but the index file is updated by cron, subsidiary
-   pathnames will remain valid across a change.
-
  - we could still be using more of the information coming from
    autoconf. Our config.h is defining a whole bunch of HAVE_FOOs for
    particular functions (e.g. HAVE_INET_NTOA, HAVE_MEMCHR,
diff --git a/agedu.c b/agedu.c
index afaa556..fd8aaea 100644 (file)
--- a/agedu.c
+++ b/agedu.c
@@ -1354,6 +1354,28 @@ int main(int argc, char **argv)
            maxpathlen = trie_maxpathlen(mappedfile);
            pathbuf = snewn(maxpathlen, char);
 
+           if (!querydir || !gotdepth) {
+               /*
+                * Single output file.
+                */
+               if (!querydir) {
+                    cfg.uriformat = "/%|/%p/%|%|/%p";
+               } else {
+                   cfg.uriformat = NULL;
+               }
+               cfg.autoage = htmlautoagerange;
+               cfg.oldest = htmloldest;
+               cfg.newest = htmlnewest;
+               cfg.showfiles = showfiles;
+           } else {
+                cfg.uriformat = "/index.html%|/%/p.html";
+                cfg.fileformat = "/index.html%|/%/p.html";
+               cfg.autoage = htmlautoagerange;
+               cfg.oldest = htmloldest;
+               cfg.newest = htmlnewest;
+               cfg.showfiles = showfiles;
+           }
+
            if (!querydir) {
                /*
                 * If we're run in --cgi mode, read PATH_INFO to get
@@ -1364,13 +1386,27 @@ int main(int argc, char **argv)
                if (!path_info)
                    path_info = "";
 
+                /*
+                 * Parse the path.
+                 */
+                if (!html_parse_path(mappedfile, path_info, &cfg, &xi)) {
+                   printf("Status: 404\nContent-type: text/html\n\n"
+                          "<html><head>"
+                          "<title>404 Not Found</title>"
+                          "</head><body>"
+                          "<h1>400 Not Found</h1>"
+                          "<p>Invalid <code>agedu</code> pathname."
+                          "</body></html>\n");
+                   return 0;
+               }
+
                /*
-                * Because we need relative links to go to the
-                * right place, it's important that our
-                * PATH_INFO should contain a slash right at the
-                * start, and no slashes anywhere else.
+                * If the path was parseable but not canonically
+                * expressed, return a redirect to the canonical
+                * version.
                 */
-               if (path_info[0] != '/') {
+                char *canonpath = html_format_path(mappedfile, &cfg, xi);
+               if (strcmp(canonpath, path_info)) {
                    char *servername = getenv("SERVER_NAME");
                    char *scriptname = getenv("SCRIPT_NAME");
                    if (!servername || !scriptname) {
@@ -1392,7 +1428,7 @@ int main(int argc, char **argv)
                        return 0;
                    }
                    printf("Status: 301\n"
-                          "Location: http://%s/%s/\n"
+                          "Location: http://%s/%s%s\n"
                           "Content-type: text/html\n\n"
                           "<html><head>"
                           "<title>301 Moved</title>"
@@ -1400,39 +1436,10 @@ int main(int argc, char **argv)
                           "<h1>301 Moved</h1>"
                           "<p>Moved."
                           "</body></html>\n",
-                          servername, scriptname);
-                   return 0;
-               } else if (strchr(path_info+1, '/')) {
-                   printf("Status: 404\nContent-type: text/html\n\n"
-                          "<html><head>"
-                          "<title>404 Not Found</title>"
-                          "</head><body>"
-                          "<h1>400 Not Found</h1>"
-                          "<p>Invalid <code>agedu</code> pathname."
-                          "</body></html>\n");
+                          servername, scriptname, canonpath);
                    return 0;
                }
-               xi = atoi(path_info + 1);
 
-               if (xi >= trie_count(mappedfile)) {
-                   printf("Status: 404\nContent-type: text/html\n\n"
-                          "<html><head>"
-                          "<title>404 Not Found</title>"
-                          "</head><body>"
-                          "<h1>404 Not Found</h1>"
-                          "<p>This is not a valid pathname index."
-                          "</body></html>\n");
-                   return 0;
-               } else if (!index_has_root(mappedfile, xi)) {
-                   printf("Status: 404\nContent-type: text/html\n\n"
-                          "<html><head>"
-                          "<title>404 Not Found</title>"
-                          "</head><body>"
-                          "<h1>404 Not Found</h1>"
-                          "<p>Pathname index out of range."
-                          "</body></html>\n");
-                   return 0;
-               }
            } else {
                /*
                 * In ordinary --html mode, process a query
@@ -1468,16 +1475,6 @@ int main(int argc, char **argv)
                /*
                 * Single output file.
                 */
-               if (!querydir) {
-                   cfg.format = "%.0lu";  /* use crosslinks in --cgi mode */
-               } else {
-                   cfg.format = NULL;
-               }
-               cfg.rootpage = NULL;
-               cfg.autoage = htmlautoagerange;
-               cfg.oldest = htmloldest;
-               cfg.newest = htmlnewest;
-               cfg.showfiles = showfiles;
                html = html_query(mappedfile, xi, &cfg, 1);
                if (querydir && outfile != NULL) {
                    FILE *fp = fopen(outfile, "w");
@@ -1527,12 +1524,6 @@ int main(int argc, char **argv)
                make_successor(pathbuf);
                xi2 = trie_before(mappedfile, pathbuf);
 
-               cfg.format = "%lu.html";
-               cfg.rootpage = "index.html";
-               cfg.autoage = htmlautoagerange;
-               cfg.oldest = htmloldest;
-               cfg.newest = htmlnewest;
-               cfg.showfiles = showfiles;
                if (html_dump(mappedfile, xi, xi2, depth, &cfg, prefix))
                    return 1;
            }
@@ -1597,8 +1588,7 @@ int main(int argc, char **argv)
            dcfg.port = httpserverport;
            dcfg.closeoneof = closeoneof;
            dcfg.basicauthdata = httpauthdata;
-           pcfg.format = NULL;
-           pcfg.rootpage = NULL;
+           pcfg.uriformat = "/%|/%p/%|%|/%p";
            pcfg.autoage = htmlautoagerange;
            pcfg.oldest = htmloldest;
            pcfg.newest = htmlnewest;
diff --git a/html.c b/html.c
index d316d1e..6674b47 100644 (file)
--- a/html.c
+++ b/html.c
@@ -16,9 +16,9 @@ struct html {
     const void *t;
     unsigned long long totalsize, oldest, newest;
     char *path2;
-    char *href;
+    char *oururi;
     size_t hreflen;
-    const char *format, *rootpage;
+    const char *uriformat;
     unsigned long long thresholds[MAXCOLOUR];
     char *titletexts[MAXCOLOUR+1];
     time_t now;
@@ -339,14 +339,384 @@ static void compute_display_size(unsigned long long size,
     *fmt = fmts[shift];
 }
 
-static void make_filename(char *buf, size_t buflen,
-                         const char *format, const char *rootpage,
-                         unsigned long index)
+struct format_option {
+    const char *prefix, *suffix;       /* may include '%%' */
+    int prefixlen, suffixlen;          /* does not count '%%' */
+    char fmttype;                      /* 0 for none, or 'n' or 'p' */
+    int translate_pathsep;             /* pathsep rendered as '/'? */
+    int shorten_path;                  /* omit common prefix? */
+};
+
+/*
+ * Gets the next format option from a format string. Advances '*fmt'
+ * past it, or sets it to NULL if nothing is left.
+ */
+struct format_option get_format_option(const char **fmt)
+{
+    struct format_option ret;
+
+    /*
+     * Scan for prefix of format.
+     */
+    ret.prefix = *fmt;
+    ret.prefixlen = 0;
+    while (1) {
+        if (**fmt == '\0') {
+            /*
+             * No formatting directive, and this is the last option.
+             */
+            ret.suffix = *fmt;
+            ret.suffixlen = 0;
+            ret.fmttype = '\0';
+            *fmt = NULL;
+            return ret;
+        } else if (**fmt == '%') {
+            if ((*fmt)[1] == '%') {
+                (*fmt) += 2;           /* just advance one extra */
+                ret.prefixlen++;
+            } else if ((*fmt)[1] == '|') {
+                /*
+                 * No formatting directive.
+                 */
+                ret.suffix = *fmt;
+                ret.suffixlen = 0;
+                ret.fmttype = '\0';
+                (*fmt) += 2;           /* advance to start of next option */
+                return ret;
+            } else {
+                break;
+            }
+        } else {
+            (*fmt)++;                  /* normal character */
+            ret.prefixlen++;
+        }
+    }
+
+    /*
+     * Interpret formatting directive with flags.
+     */
+    (*fmt)++;
+    ret.translate_pathsep = ret.shorten_path = 1;
+    while (1) {
+        char c = *(*fmt)++;
+        assert(c);
+        if (c == '/') {
+            ret.translate_pathsep = 0;
+        } else if (c == '-') {
+            ret.shorten_path = 0;
+        } else {
+            assert(c == 'n' || c == 'p');
+            ret.fmttype = c;
+            break;
+        }
+    }
+
+    /*
+     * Scan for suffix.
+     */
+    ret.suffix = *fmt;
+    ret.suffixlen = 0;
+    while (1) {
+        if (**fmt == '\0') {
+            /*
+             * This is the last option.
+             */
+            *fmt = NULL;
+            return ret;
+        } else if (**fmt != '%') {
+            (*fmt)++;                  /* normal character */
+            ret.suffixlen++;
+        } else {
+            if ((*fmt)[1] == '%') {
+                (*fmt) += 2;           /* just advance one extra */
+                ret.suffixlen++;
+            } else {
+                assert((*fmt)[1] == '|');
+                (*fmt) += 2;           /* advance to start of next option */
+                return ret;
+            }
+        }
+    }
+}
+
+char *format_string(const char *fmt, unsigned long index, const void *t)
 {
-    if (index == 0 && rootpage)
-       snprintf(buf, buflen, "%s", rootpage);
-    else
-       snprintf(buf, buflen, format, index);
+    int maxlen;
+    char *ret = NULL, *p = NULL;
+    char *path = NULL, *q = NULL;
+    char pathsep = trie_pathsep(t);
+    int maxpathlen = trie_maxpathlen(t);
+
+    while (fmt) {
+        struct format_option opt = get_format_option(&fmt);
+        if (index && !opt.fmttype)
+            continue; /* option is only good for the root, which this isn't */
+
+        maxlen = opt.prefixlen + opt.suffixlen + 1;
+        switch (opt.fmttype) {
+          case 'n':
+            maxlen += 40;              /* generous length for an integer */
+            break;
+          case 'p':
+            maxlen += 3*maxpathlen;    /* might have to escape everything */
+            break;
+        }
+        ret = snewn(maxlen, char);
+        p = ret;
+        while (opt.prefixlen-- > 0) {
+            if ((*p++ = *opt.prefix++) == '%')
+                opt.prefix++;
+        }
+        switch (opt.fmttype) {
+          case 'n':
+            p += sprintf(p, "%lu", index);
+            break;
+          case 'p':
+            path = snewn(1+trie_maxpathlen(t), char);
+            if (opt.shorten_path) {
+                trie_getpath(t, 0, path);
+                q = path + strlen(path);
+                trie_getpath(t, index, path);
+                if (*q == pathsep)
+                    q++;
+            } else {
+                trie_getpath(t, index, path);
+                q = path;
+            }
+            while (*q) {
+                char c = *q++;
+                if (c == pathsep && opt.translate_pathsep)
+                    *p++ = '/';
+                else if (!isalnum((unsigned char)c) && !strchr("-.@_", c))
+                    p += sprintf(p, "=%02X", (unsigned char)c);
+                else
+                    *p++ = c;
+            }
+            sfree(path);
+            break;
+        }
+        while (opt.suffixlen-- > 0) {
+            if ((*p++ = *opt.suffix++) == '%')
+                opt.suffix++;
+        }
+        *p = '\0';
+        assert(p - ret < maxlen);
+        return ret;
+    }
+    assert(!"Getting here implies an incomplete set of formats");
+}
+
+char *html_format_path(const void *t, const struct html_config *cfg,
+                       unsigned long index)
+{
+    return format_string(cfg->uriformat, index, t);
+}
+
+int html_parse_path(const void *t, const char *path,
+                    const struct html_config *cfg, unsigned long *index)
+{
+    int len = strlen(path);
+    int midlen;
+    const char *p, *q;
+    char *r;
+    char pathsep = trie_pathsep(t);
+    const char *fmt = cfg->uriformat;
+
+    while (fmt) {
+        struct format_option opt = get_format_option(&fmt);
+
+        /*
+         * Check prefix and suffix.
+         */
+        midlen = len - opt.prefixlen - opt.suffixlen;
+        if (midlen < 0)
+            continue;                  /* prefix and suffix don't even fit */
+
+        p = path;
+        while (opt.prefixlen > 0) {
+            char c = *opt.prefix++;
+            if (c == '%')
+                opt.prefix++;
+            if (*p != c)
+                break;
+            p++;
+            opt.prefixlen--;
+        }
+        if (opt.prefixlen > 0)
+            continue;                  /* prefix didn't match */
+
+        q = path + len - opt.suffixlen;
+        while (opt.suffixlen > 0) {
+            char c = *opt.suffix++;
+            if (c == '%')
+                opt.suffix++;
+            if (*q != c)
+                break;
+            q++;
+            opt.suffixlen--;
+        }
+        if (opt.suffixlen > 0)
+            continue;                  /* suffix didn't match */
+
+        /*
+         * Check the data in between. p points at it, and it's midlen
+         * characters long.
+         */
+        if (opt.fmttype == '\0') {
+            if (midlen == 0) {
+                /*
+                 * Successful match against a root format.
+                 */
+                *index = 0;
+                return 1;
+            }
+        } else if (opt.fmttype == 'n') {
+            *index = 0;
+            while (midlen > 0) {
+                if (*p >= '0' && *p <= '9')
+                    *index = *index * 10 + (*p - '0');
+                else
+                    break;
+                midlen--;
+                p++;
+            }
+            if (midlen == 0) {
+                /*
+                 * Successful match against a numeric format.
+                 */
+                return 1;
+            }
+        } else {
+            assert(opt.fmttype == 'p');
+
+            int maxoutlen = trie_maxpathlen(t) + 1;
+            int maxinlen = midlen + 1;
+            char triepath[maxinlen+maxoutlen];
+
+            if (opt.shorten_path) {
+                trie_getpath(t, 0, triepath);
+                r = triepath + strlen(triepath);
+                if (r > triepath && r[-1] != pathsep)
+                    *r++ = pathsep;
+            } else {
+                r = triepath;
+            }
+
+            while (midlen > 0) {
+                if (*p == '/' && opt.translate_pathsep) {
+                    *r++ = pathsep;
+                    p++;
+                    midlen--;
+                } else if (*p == '=') {
+                    if (midlen < 3 ||
+                        !isxdigit((unsigned char)p[1]) ||
+                        !isxdigit((unsigned char)p[2]))
+                        break;         /* faulty escape encoding */
+                    char x[3];
+                    unsigned cval;
+                    x[0] = p[1];
+                    x[1] = p[2];
+                    x[2] = '\0';
+                    sscanf(x, "%x", &cval);
+                    *r++ = cval;
+                    p += 3;
+                    midlen -= 3;
+                } else {
+                    *r++ = *p;
+                    p++;
+                    midlen--;
+                }
+            }
+            if (midlen > 0)
+                continue;      /* something went wrong in that loop */
+            assert(r - triepath < maxinlen+maxoutlen);
+            *r = '\0';
+
+            unsigned long gotidx = trie_before(t, triepath);
+            if (gotidx >= trie_count(t))
+                continue;              /* index out of range */
+            char retpath[1+maxoutlen];
+            trie_getpath(t, gotidx, retpath);
+            if (strcmp(triepath, retpath))
+                continue;           /* exact path not found in trie */
+            if (!index_has_root(t, gotidx))
+                continue;              /* path is not a directory */
+
+            /*
+             * Successful path-based match.
+             */
+            *index = gotidx;
+            return 1;
+        }
+    }
+
+    return 0;                    /* no match from any format option */
+}
+
+char *make_href(const char *source, const char *target)
+{
+    /*
+     * We insist that both source and target URIs start with a /, or
+     * else we won't be reliably able to construct relative hrefs
+     * between them (e.g. because we've got a suffix on the end of
+     * some CGI pathname that this function doesn't know the final
+     * component of).
+     */
+    assert(*source == '/');
+    assert(*target == '/');
+
+    /*
+     * Find the last / in source. Everything up to but not including
+     * that is the directory to which the output href will be
+     * relative. We enforce by assertion that there must be a /
+     * somewhere in source, or else we can't construct a relative href
+     * at all.
+     */
+    const char *sourceend = strrchr(source, '/');
+    assert(sourceend != NULL);
+
+    /*
+     * See how far the target URI agrees with the source one, up to
+     * and including that /.
+     */
+    const char *s = source, *t = target;
+    while (s <= sourceend && *s == *t)
+        s++, t++;
+
+    /*
+     * We're only interested in agreement of complete path components,
+     * so back off until we're sitting just after a shared /.
+     */
+    while (s > source && s[-1] != '/')
+        s--, t--;
+    assert(s > source);
+
+    /*
+     * Now we need some number of levels of "../" to get from source
+     * to here, and then we just replicate the rest of 'target'.
+     */
+    int levels = 0;
+    while (s <= sourceend) {
+        if (*s == '/')
+            levels++;
+        s++;
+    }
+    int len = 3*levels + strlen(t);
+    if (len == 0) {
+        /* One last special case: target has no tail _and_ we need no
+         * "../", so the href would be empty; use "./" instead. */
+        return dupstr("./");
+    } else {
+        char *ret = snewn(len+1, char);
+        char *p = ret;
+        while (levels-- > 0) {
+            *p++ = '.';
+            *p++ = '.';
+            *p++ = '/';
+        }
+        strcpy(p, t);
+        return ret;
+    }
 }
 
 #define PIXEL_SIZE 600                /* FIXME: configurability? */
@@ -413,11 +783,13 @@ static void write_report_line(struct html *ctx, struct vector *vec)
     if (vec->name) {
        int doing_href = 0;
 
-       if (ctx->format && vec->want_href) {
-           make_filename(ctx->href, ctx->hreflen,
-                         ctx->format, ctx->rootpage,
-                         vec->index);
-           htprintf(ctx, "<a href=\"%s\">", ctx->href);
+       if (ctx->uriformat && vec->want_href) {
+           char *targeturi = format_string(ctx->uriformat, vec->index,
+                                            ctx->t);
+            char *link = make_href(ctx->oururi, targeturi);
+           htprintf(ctx, "<a href=\"%s\">", link);
+            sfree(link);
+            sfree(targeturi);
            doing_href = 1;
        }
        if (vec->literal)
@@ -447,9 +819,9 @@ char *html_query(const void *t, unsigned long index,
                 const struct html_config *cfg, int downlink)
 {
     struct html actx, *ctx = &actx;
-    char *path, *path2, *p, *q, *href;
+    char *path, *path2, *p, *q;
     char agebuf1[80], agebuf2[80];
-    size_t pathlen, subdirpos, hreflen;
+    size_t pathlen, subdirpos;
     unsigned long index2;
     int i;
     struct vector **vecs;
@@ -462,21 +834,12 @@ char *html_query(const void *t, unsigned long index,
     ctx->buf = NULL;
     ctx->buflen = ctx->bufsize = 0;
     ctx->t = t;
-    ctx->format = cfg->format;
-    ctx->rootpage = cfg->rootpage;
+    ctx->uriformat = cfg->uriformat;
     htprintf(ctx, "<html>\n");
 
     path = snewn(1+trie_maxpathlen(t), char);
     ctx->path2 = path2 = snewn(1+trie_maxpathlen(t), char);
-    if (cfg->format) {
-       hreflen = strlen(cfg->format) + 100;
-       href = snewn(hreflen, char);
-    } else {
-       hreflen = 0;
-       href = NULL;
-    }
-    ctx->hreflen = hreflen;
-    ctx->href = href;
+    ctx->oururi = format_string(cfg->uriformat, index, t);
 
     /*
      * HEAD section.
@@ -519,11 +882,12 @@ char *html_query(const void *t, unsigned long index,
        *zp = '\0';
        index2 = trie_before(t, path);
        trie_getpath(t, index2, path2);
-       if (!strcmptrailingpathsep(path, path2) && cfg->format) {
-           make_filename(href, hreflen, cfg->format, cfg->rootpage, index2);
-           if (!*href)                /* special case that we understand */
-               strcpy(href, "./");
-           htprintf(ctx, "<a href=\"%s\">", href);
+       if (!strcmptrailingpathsep(path, path2) && cfg->uriformat) {
+           char *targeturi = format_string(cfg->uriformat, index2, t);
+            char *link = make_href(ctx->oururi, targeturi);
+           htprintf(ctx, "<a href=\"%s\">", link);
+            sfree(link);
+            sfree(targeturi);
            doing_href = 1;
        }
        *zp = c;
@@ -651,7 +1015,7 @@ char *html_query(const void *t, unsigned long index,
      */
     htprintf(ctx, "</body>\n");
     htprintf(ctx, "</html>\n");
-    sfree(href);
+    sfree(ctx->oururi);
     sfree(path2);
     sfree(path);
     for (i = 0; i < nvecs; i++) {
@@ -670,13 +1034,10 @@ int html_dump(const void *t, unsigned long index, unsigned long endindex,
     /*
      * Determine the filename for this file.
      */
-    assert(cfg->format != NULL);
-    int prefixlen = strlen(pathprefix);
-    int fnmax = strlen(pathprefix) + strlen(cfg->format) + 100;
-    char filename[fnmax];
-    strcpy(filename, pathprefix);
-    make_filename(filename + prefixlen, fnmax - prefixlen,
-                 cfg->format, cfg->rootpage, index);
+    assert(cfg->fileformat != NULL);
+    char *filename = format_string(cfg->fileformat, index, t);
+    char *path = dupfmt("%s%s", pathprefix, filename);
+    sfree(filename);
 
     /*
      * Create the HTML itself. Don't write out downlinks from our
@@ -687,23 +1048,21 @@ int html_dump(const void *t, unsigned long index, unsigned long endindex,
     /*
      * Write it out.
      */
-    FILE *fp = fopen(filename, "w");
+    FILE *fp = fopen(path, "w");
     if (!fp) {
-       fprintf(stderr, "%s: %s: open: %s\n", PNAME,
-               filename, strerror(errno));
+       fprintf(stderr, "%s: %s: open: %s\n", PNAME, path, strerror(errno));
        return 1;
     }
     if (fputs(html, fp) < 0) {
-       fprintf(stderr, "%s: %s: write: %s\n", PNAME,
-               filename, strerror(errno));
+       fprintf(stderr, "%s: %s: write: %s\n", PNAME, path, strerror(errno));
        fclose(fp);
        return 1;
     }
     if (fclose(fp) < 0) {
-       fprintf(stderr, "%s: %s: fclose: %s\n", PNAME,
-               filename, strerror(errno));
+       fprintf(stderr, "%s: %s: fclose: %s\n", PNAME, path, strerror(errno));
        return 1;
     }
+    sfree(path);
 
     /*
      * Recurse.
@@ -711,12 +1070,12 @@ int html_dump(const void *t, unsigned long index, unsigned long endindex,
     if (maxdepth != 0) {
        unsigned long subindex, subendindex;
        int newdepth = (maxdepth > 0 ? maxdepth - 1 : maxdepth);
-       char path[1+trie_maxpathlen(t)];
+       char rpath[1+trie_maxpathlen(t)];
 
        index++;
        while (index < endindex) {
-           trie_getpath(t, index, path);
-           get_indices(t, path, &subindex, &subendindex);
+           trie_getpath(t, index, rpath);
+           get_indices(t, rpath, &subindex, &subendindex);
            index = subendindex;
            if (subendindex - subindex > 1) {
                if (html_dump(t, subindex, subendindex, newdepth,
diff --git a/html.h b/html.h
index dc90c4b..d420036 100644 (file)
--- a/html.h
+++ b/html.h
@@ -4,21 +4,68 @@
 
 struct html_config {
     /*
-     * If "format" is non-NULL, it is treated as an sprintf format
-     * string which must contain exactly one %lu and no other
-     * formatting directives (other than %%, which doesn't count);
-     * this will be used to construct URLs to use in hrefs
-     * pointing to queries of other related (parent and child)
-     * pathnames.
+     * Configure the format of the URI pathname fragment corresponding
+     * to a given tree entry.
+     *
+     * 'uriformat' is expected to have the following format:
+     *  - it consists of one or more _options_, each indicating a
+     *    particular way to format a URI, separated by '%|'
+     *  - each option contains _at most one_ formatting directive;
+     *    without any, it is assumed to only be able to encode the
+     *    root tree entry
+     *  - the formatting directive may be preceded and/or followed by
+     *    literal text; percent signs in that literal text are
+     *    specified as %% (which doesn't count as a formatting
+     *    directive for the 'at most one' rule)
+     *  - formatting directives are as follows:
+     *     + '%n' outputs the numeric index (in decimal) of the tree
+     *       entry
+     *     + '%p' outputs the pathname of the tree entry, not counting
+     *       any common prefix of the whole tree or a subdirectory
+     *       separator following that (so that the root directory of
+     *       the tree will always be rendered as the empty string).
+     *       The subdirectory separator is translated into '/'; any
+     *       remotely worrying character is escaped as = followed by
+     *       two hex digits (including, in particular, = itself). The
+     *       only characters not escaped are the ASCII letters and
+     *       digits, the subdirectory separator as mentioned above,
+     *       and the four punctuation characters -.@_ .
+     *     + '%/p' outputs the pathname of the tree entry, but this time
+     *       the subdirectory separator is also considered to be a
+     *       worrying character and is escaped.
+     *     + '%-p' and '%-/p' are like '%p' and '%/p' respectively,
+     *       except that they use the full pathname stored in the tree
+     *       without stripping a common prefix.
+     *
+     * These formats are used both for generating and parsing URI
+     * fragments. When generating, the first valid option is used
+     * (which is always the very first one if we're generating the
+     * root URI, or else it's the first option with any formatting
+     * directive); when parsing, the first option that matches will be
+     * accepted. (Thus, you can have '.../subdir' and '.../subdir/'
+     * both accepted, but make the latter canonical; clients of this
+     * mechanism will typically regenerate a URI string after parsing
+     * an index out of it, and return an HTTP redirect if it isn't in
+     * canonical form.)
+     *
+     * All hyperlinks should be correctly generated as relative (i.e.
+     * with the right number of ../ and ./ considering both the
+     * pathname for the page currently being generated, and the one
+     * for the link target).
+     *
+     * If 'uriformat' is NULL, the HTML is generated without hyperlinks.
      */
-    const char *format;
+    const char *uriformat;
 
     /*
-     * If "rootpage" is non-NULL, it overrides "format" to give a
-     * special name (e.g. "index.html") to the top-level page of the
-     * index.
+     * Configure the filenames output by html_dump(). These can be
+     * configured separately from the URI formats, so that the root
+     * file can be called index.html on disk but have a notional URI
+     * of just / or similar.
+     *
+     * Formatting directives are the same as for 'uriformat' above.
      */
-    const char *rootpage;
+    const char *fileformat;
 
     /*
      * Time stamps to assign to the extreme ends of the colour
@@ -37,6 +84,22 @@ struct html_config {
 };
 
 /*
+ * Parse a URI pathname segment against the URI formats specified in
+ * 'cfg', and return a numeric index in '*index'. Return value is true
+ * on success, or false if the pathname makes no sense, or the index
+ * is out of range, or the index does not correspond to a directory in
+ * the trie.
+ */
+int html_parse_path(const void *t, const char *path,
+                    const struct html_config *cfg, unsigned long *index);
+
+/*
+ * Generate a URI pathname segment from an index.
+ */
+char *html_format_path(const void *t, const struct html_config *cfg,
+                       unsigned long index);
+
+/*
  * Generate an HTML document containing the results of a query
  * against the pathname at a given index. Returns a dynamically
  * allocated piece of memory containing the entire HTML document,
diff --git a/httpd.c b/httpd.c
index dd7aa23..6c90e66 100644 (file)
--- a/httpd.c
+++ b/httpd.c
@@ -268,22 +268,78 @@ char *got_data(struct connctx *ctx, char *data, int length,
                                 "This is a restricted-access set of pages.");
            }
        } else {
-           char *q;
            p = ctx->url;
-           p += strspn(p, "/?");
-           index = strtoul(p, &q, 10);
-           if (*q) {
+           if (!html_parse_path(ctx->t, p, cfg, &index)) {
                ret = http_error("404", "Not Found", NULL,
-                                "This is not a valid pathname index.");
+                                "This is not a valid pathname.");
            } else {
-               document = html_query(ctx->t, index, cfg, 1);
-               if (document) {
-                   ret = http_success("text/html", 1, document);
-                   sfree(document);
-               } else {
-                   ret = http_error("404", "Not Found", NULL,
-                                    "Pathname index out of range.");
-               }
+                char *canonpath = html_format_path(ctx->t, cfg, index);
+                if (!strcmp(canonpath, p)) {
+                    /*
+                     * This is a canonical path. Return the document.
+                     */
+                    document = html_query(ctx->t, index, cfg, 1);
+                    if (document) {
+                        ret = http_success("text/html", 1, document);
+                        sfree(document);
+                    } else {
+                        ret = http_error("404", "Not Found", NULL,
+                                         "This is not a valid pathname.");
+                    }
+                } else {
+                    /*
+                     * This is a non-canonical path. Return a redirect
+                     * to the right one.
+                     *
+                     * To do this, we must search the request headers
+                     * for Host:, to see what the client thought it
+                     * was calling our server.
+                     */
+
+                    char *host = NULL;
+                    q = ctx->data + ctx->datalen;
+                    for (p = ctx->headers; p < q; p++) {
+                        const char *hdr = "Host:";
+                        int i;
+                        for (i = 0; hdr[i]; i++) {
+                            if (p >= q || tolower((unsigned char)*p) !=
+                                tolower((unsigned char)hdr[i]))
+                                break;
+                            p++;
+                        }
+                        if (!hdr[i])
+                            break;     /* found our header */
+                        p = memchr(p, '\n', q - p);
+                        if (!p)
+                            p = q;
+                    }
+                    if (p < q) {
+                        while (p < q && isspace((unsigned char)*p))
+                            p++;
+                        r = p;
+                        while (p < q) {
+                            if (*p == '\r' && (p+1 >= q || p[1] == '\n'))
+                                break;
+                            p++;
+                        }
+                        host = snewn(p-r+1, char);
+                        memcpy(host, r, p-r);
+                        host[p-r] = '\0';
+                    }
+                    if (host) {
+                        char *header = dupfmt("Location: http://%s%s\r\n",
+                                              host, canonpath);
+                        ret = http_error("301", "Moved", header,
+                                         "This is not the canonical form of"
+                                         " this pathname.");
+                        sfree(header);
+                    } else {
+                        ret = http_error("400", "Bad Request", NULL,
+                                         "Needed a Host: header to return"
+                                         " the intended redirection.");
+                    }
+                }
+                sfree(canonpath);
            }
        }
        return ret;
@@ -417,8 +473,6 @@ void run_httpd(const void *t, int authmask, const struct httpd_config *dcfg,
     socklen_t addrlen;
     struct html_config cfg = *incfg;
 
-    cfg.format = "%.0lu";
-
     /*
      * Establish the listening socket and retrieve its port
      * number.