More HTTP server configuration: allow user-chosen address+port and
[sgt/agedu] / trie.c
CommitLineData
70322ae3 1/*
2 * trie.c: implementation of trie.h.
3 */
4
5#include <assert.h>
6#include <stdio.h>
7#include <string.h>
8#include <stdlib.h>
9#include <errno.h>
10
11#include <sys/types.h>
12#include <unistd.h>
13
14#include "malloc.h"
15#include "trie.h"
16
17#define alignof(typ) ( offsetof(struct { char c; typ t; }, t) )
18
19/*
20 * Compare functions for pathnames. Returns the relative order of
21 * the names, like strcmp; also passes back the offset of the
22 * first differing character if desired.
23 */
24static int trieccmp(unsigned char a, unsigned char b)
25{
26 a = (a == '\0' ? '\0' : a == '/' ? '\1' : a+1);
27 b = (b == '\0' ? '\0' : b == '/' ? '\1' : b+1);
28 return a - b;
29}
30
31static int triencmp(const char *a, size_t alen,
32 const char *b, size_t blen, int *offset)
33{
34 int off = 0;
35 while (off < alen && off < blen && a[off] == b[off])
36 off++;
37 if (offset)
38 *offset = off;
39 if (off == alen || off == blen) return (off == blen) - (off == alen);
40 return trieccmp(a[off], b[off]);
41}
42
43static int triecmp(const char *a, const char *b, int *offset)
44{
45 return triencmp(a, strlen(a), b, strlen(b), offset);
46}
47
48/* ----------------------------------------------------------------------
49 * Trie node structures.
50 *
51 * The trie format stored in the file consists of three distinct
52 * node types, each with a distinguishing type field at the start.
53 *
54 * TRIE_LEAF is a leaf node; it contains an actual trie_file
55 * structure, and indicates that when you're searching down the
56 * trie with a string, you should now expect to encounter
57 * end-of-string.
58 *
59 * TRIE_SWITCH indicates that the set of strings in the trie
60 * include strings with more than one distinct character after the
61 * prefix leading up to this point. Hence, it stores multiple
62 * subnode pointers and a different single character for each one.
63 *
64 * TRIE_STRING indicates that at this point everything in the trie
65 * has the same next few characters; it stores a single mandatory
66 * string fragment and exactly one subnode pointer.
67 */
68enum {
69 TRIE_LEAF = 0x7fffe000,
70 TRIE_SWITCH,
71 TRIE_STRING
72};
73
74struct trie_common {
75 int type;
76};
77
78struct trie_switchentry {
79 off_t subnode;
80 int subcount;
81};
82
83struct trie_leaf {
84 struct trie_common c;
85 struct trie_file file;
86};
87
88struct trie_switch {
89 struct trie_common c;
90 /*
91 * sw[0] to sw[len-1] give the subnode pointers and element
92 * counts. At &sw[len] is stored len single bytes which are
93 * the characters corresponding to each subnode.
94 */
95 int len;
96 struct trie_switchentry sw[];
97};
98
99struct trie_string {
100 struct trie_common c;
101 int stringlen;
102 off_t subnode;
103 char string[];
104};
105
106struct trie_header {
107 unsigned long magic;
108 off_t root, indexroot;
109 int count;
110 size_t maxpathlen;
111};
112
113/* Union only used for computing alignment */
114union trie_node {
115 struct trie_leaf leaf;
116 struct { /* fake trie_switch with indeterminate array length filled in */
117 struct trie_common c;
118 int len;
119 struct trie_switchentry sw[1];
120 } sw;
121 struct { /* fake trie_string with indeterminate array length filled in */
122 struct trie_common c;
123 int stringlen;
124 off_t subnode;
125 char string[1];
126 } str;
127};
128#define TRIE_MAGIC 0x75646761UL
129#define TRIE_ALIGN alignof(union trie_node)
130
131/* ----------------------------------------------------------------------
132 * Trie-building functions.
133 */
134
135struct tbswitch {
136 int len;
137 char c[256];
138 off_t off[256];
139 int count[256];
140};
141
142struct triebuild {
143 int fd;
144 off_t offset;
145 char *lastpath;
146 int lastlen, lastsize;
147 off_t lastoff;
148 struct tbswitch *switches;
149 int switchsize;
150 size_t maxpathlen;
151};
152
153static void tb_seek(triebuild *tb, off_t off)
154{
155 tb->offset = off;
156 if (lseek(tb->fd, off, SEEK_SET) < 0) {
157 fprintf(stderr, "agedu: lseek: %s\n", strerror(errno));
158 exit(1);
159 }
160}
161
162static void tb_write(triebuild *tb, const void *buf, size_t len)
163{
164 tb->offset += len;
165 while (len > 0) {
166 int ret = write(tb->fd, buf, len);
167 if (ret < 0) {
168 fprintf(stderr, "agedu: write: %s\n", strerror(errno));
169 exit(1);
170 }
171 len -= ret;
172 buf = (const void *)((const char *)buf + ret);
173 }
174}
175
176static char trie_align_zeroes[TRIE_ALIGN];
177
178static void tb_align(triebuild *tb)
179{
180 int off = (TRIE_ALIGN - ((tb)->offset % TRIE_ALIGN)) % TRIE_ALIGN;
181 tb_write(tb, trie_align_zeroes, off);
182}
183
184triebuild *triebuild_new(int fd)
185{
186 triebuild *tb = snew(triebuild);
187 struct trie_header th;
188
189 tb->fd = fd;
190 tb->lastpath = NULL;
191 tb->lastlen = tb->lastsize = 0;
192 tb->lastoff = 0;
193 tb->switches = NULL;
194 tb->switchsize = 0;
195 tb->maxpathlen = 0;
196
197 th.magic = TRIE_MAGIC;
198 th.root = th.count = 0;
199 th.indexroot = 0;
200 th.maxpathlen = 0;
201
202 tb_seek(tb, 0);
203 tb_write(tb, &th, sizeof(th));
204
205 return tb;
206}
207
208static off_t triebuild_unwind(triebuild *tb, int targetdepth, int *outcount)
209{
210 off_t offset;
211 int count, depth;
212
213 if (tb->lastoff == 0) {
214 *outcount = 0;
215 return 0;
216 }
217
218 offset = tb->lastoff;
219 count = 1;
220 depth = tb->lastlen + 1;
221
222 assert(depth >= targetdepth);
223
224 while (depth > targetdepth) {
225 int odepth = depth;
226 while (depth > targetdepth &&
227 (depth-1 > tb->switchsize || tb->switches[depth-1].len == 0))
228 depth--;
229 if (odepth > depth) {
230 /*
231 * Write out a string node.
232 */
233 size_t nodesize = sizeof(struct trie_string) + odepth - depth;
234 struct trie_string *st = (struct trie_string *)smalloc(nodesize);
235 st->c.type = TRIE_STRING;
236 st->stringlen = odepth - depth;
237 st->subnode = offset;
238 memcpy(st->string, tb->lastpath + depth, odepth - depth);
239 tb_align(tb);
240 offset = tb->offset;
241 tb_write(tb, st, nodesize);
242 sfree(st);
243 }
244
245 assert(depth >= targetdepth);
246 if (depth <= targetdepth)
247 break;
248
249 /*
250 * Now we expect to be sitting just below a switch node.
251 * Add our final entry to it and write it out.
252 */
253 depth--;
254 {
255 struct trie_switch *sw;
256 char *chars;
257 size_t nodesize;
258 int swlen = tb->switches[depth].len;
259 int i;
260
261 assert(swlen > 0);
262
263 tb->switches[depth].c[swlen] = tb->lastpath[depth];
264 tb->switches[depth].off[swlen] = offset;
265 tb->switches[depth].count[swlen] = count;
266 swlen++;
267
268 nodesize = sizeof(struct trie_switch) +
269 swlen * sizeof(struct trie_switchentry) + swlen;
270 sw = (struct trie_switch *)smalloc(nodesize);
271 chars = (char *)&sw->sw[swlen];
272
273 sw->c.type = TRIE_SWITCH;
274 sw->len = swlen;
275 count = 0;
276 for (i = 0; i < swlen; i++) {
277 sw->sw[i].subnode = tb->switches[depth].off[i];
278 sw->sw[i].subcount = tb->switches[depth].count[i];
279 chars[i] = tb->switches[depth].c[i];
280
281 count += tb->switches[depth].count[i];
282 }
283
284 tb_align(tb);
285 offset = tb->offset;
286 tb_write(tb, sw, nodesize);
287 sfree(sw);
288
289 tb->switches[depth].len = 0; /* clear this node */
290 }
291 }
292
293 *outcount = count;
294 return offset;
295}
296
297void triebuild_add(triebuild *tb, const char *pathname,
298 const struct trie_file *file)
299{
300 int pathlen = strlen(pathname);
301 int depth;
302
303 if (tb->maxpathlen < pathlen+1)
304 tb->maxpathlen = pathlen+1;
305
306 if (tb->lastpath) {
307 off_t offset;
308 int count;
309
310 /*
311 * Find the first differing character between this pathname
312 * and the previous one.
313 */
314 int ret = triecmp(tb->lastpath, pathname, &depth);
315 assert(ret < 0);
316
317 /*
318 * Finalise all nodes above this depth.
319 */
320 offset = triebuild_unwind(tb, depth+1, &count);
321
322 /*
323 * Add the final node we just acquired to the switch node
324 * at our chosen depth, creating it if it isn't already
325 * there.
326 */
327 if (tb->switchsize <= depth) {
328 int oldsize = tb->switchsize;
329 tb->switchsize = depth * 3 / 2 + 64;
330 tb->switches = sresize(tb->switches, tb->switchsize,
331 struct tbswitch);
332 while (oldsize < tb->switchsize)
333 tb->switches[oldsize++].len = 0;
334 }
335
336 tb->switches[depth].c[tb->switches[depth].len] = tb->lastpath[depth];
337 tb->switches[depth].off[tb->switches[depth].len] = offset;
338 tb->switches[depth].count[tb->switches[depth].len] = count;
339 tb->switches[depth].len++;
340 }
341
342 /*
343 * Write out a leaf node for the new file, and remember its
344 * file offset.
345 */
346 {
347 struct trie_leaf leaf;
348
349 leaf.c.type = TRIE_LEAF;
350 leaf.file = *file; /* structure copy */
351
352 tb_align(tb);
353 tb->lastoff = tb->offset;
354 tb_write(tb, &leaf, sizeof(leaf));
355 }
356
357 /*
358 * Store this pathname for comparison with the next one.
359 */
360 if (tb->lastsize < pathlen+1) {
361 tb->lastsize = pathlen * 3 / 2 + 64;
362 tb->lastpath = sresize(tb->lastpath, tb->lastsize, char);
363 }
364 strcpy(tb->lastpath, pathname);
365 tb->lastlen = pathlen;
366}
367
368int triebuild_finish(triebuild *tb)
369{
370 struct trie_header th;
371
372 th.magic = TRIE_MAGIC;
373 th.root = triebuild_unwind(tb, 0, &th.count);
374 th.indexroot = 0;
375 th.maxpathlen = tb->maxpathlen;
376
377 tb_seek(tb, 0);
378 tb_write(tb, &th, sizeof(th));
379
380 return th.count;
381}
382
383void triebuild_free(triebuild *tb)
384{
385 sfree(tb->switches);
386 sfree(tb->lastpath);
387 sfree(tb);
388}
389
390/* ----------------------------------------------------------------------
391 * Querying functions.
392 */
393
394#define NODE(t, off, type) \
395 ((const struct type *)((const char *)(t) + (off)))
396
397size_t trie_maxpathlen(const void *t)
398{
399 const struct trie_header *hdr = NODE(t, 0, trie_header);
400 return hdr->maxpathlen;
401}
402
403unsigned long trie_before(const void *t, const char *pathname)
404{
405 const struct trie_header *hdr = NODE(t, 0, trie_header);
406 int ret = 0, lastcount = hdr->count;
407 int len = 1 + strlen(pathname), depth = 0;
408 off_t off = hdr->root;
409
410 while (1) {
411 const struct trie_common *node = NODE(t, off, trie_common);
412 if (node->type == TRIE_LEAF) {
413 if (depth < len)
414 ret += lastcount; /* _shouldn't_ happen, but in principle */
415 return ret;
416 } else if (node->type == TRIE_STRING) {
417 const struct trie_string *st = NODE(t, off, trie_string);
418
419 int offset;
420 int cmp = triencmp(st->string, st->stringlen,
421 pathname + depth, len-depth, &offset);
422
423 if (offset < st->stringlen) {
424 if (cmp < 0)
425 ret += lastcount;
426 return ret;
427 }
428
429 depth += st->stringlen;
430 off = st->subnode;
431 } else if (node->type == TRIE_SWITCH) {
432 const struct trie_switch *sw = NODE(t, off, trie_switch);
433 const char *chars = (const char *)&sw->sw[sw->len];
434 int i;
435
436 for (i = 0; i < sw->len; i++) {
437 int c = chars[i];
438 int cmp = trieccmp(pathname[depth], c);
439 if (cmp > 0)
440 ret += sw->sw[i].subcount;
441 else if (cmp < 0)
442 return ret;
443 else {
444 off = sw->sw[i].subnode;
445 lastcount = sw->sw[i].subcount;
446 depth++;
447 break;
448 }
449 }
450 if (i == sw->len)
451 return ret;
452 }
453 }
454}
455
456void trie_getpath(const void *t, unsigned long n, char *buf)
457{
458 const struct trie_header *hdr = NODE(t, 0, trie_header);
459 int depth = 0;
460 off_t off = hdr->root;
461
462 while (1) {
463 const struct trie_common *node = NODE(t, off, trie_common);
464 if (node->type == TRIE_LEAF) {
465 assert(depth > 0 && buf[depth-1] == '\0');
466 return;
467 } else if (node->type == TRIE_STRING) {
468 const struct trie_string *st = NODE(t, off, trie_string);
469
470 memcpy(buf + depth, st->string, st->stringlen);
471 depth += st->stringlen;
472 off = st->subnode;
473 } else if (node->type == TRIE_SWITCH) {
474 const struct trie_switch *sw = NODE(t, off, trie_switch);
475 const char *chars = (const char *)&sw->sw[sw->len];
476 int i;
477
478 for (i = 0; i < sw->len; i++) {
479 if (n < sw->sw[i].subcount) {
480 buf[depth++] = chars[i];
481 off = sw->sw[i].subnode;
482 break;
483 } else
484 n -= sw->sw[i].subcount;
485 }
486 assert(i < sw->len);
487 }
488 }
489}
490
491unsigned long trie_count(const void *t)
492{
493 const struct trie_header *hdr = NODE(t, 0, trie_header);
494 return hdr->count;
495}
496
497struct triewalk_switch {
498 const struct trie_switch *sw;
499 int pos, depth, count;
500};
501struct triewalk {
502 const void *t;
503 struct triewalk_switch *switches;
504 int nswitches, switchsize;
505 int count;
506};
507triewalk *triewalk_new(const void *vt)
508{
509 triewalk *tw = snew(triewalk);
510
511 tw->t = (const char *)vt;
512 tw->switches = NULL;
513 tw->switchsize = 0;
514 tw->nswitches = -1;
515 tw->count = 0;
516
517 return tw;
518}
519const struct trie_file *triewalk_next(triewalk *tw, char *buf)
520{
521 off_t off;
522 int depth;
523
524 if (tw->nswitches < 0) {
525 const struct trie_header *hdr = NODE(tw->t, 0, trie_header);
526 off = hdr->root;
527 depth = 0;
528 tw->nswitches = 0;
529 } else {
530 while (1) {
531 int swpos;
532 const struct trie_switch *sw;
533 const char *chars;
534
535 if (tw->nswitches == 0) {
536 assert(tw->count == NODE(tw->t, 0, trie_header)->count);
537 return NULL; /* run out of trie */
538 }
539
540 swpos = tw->switches[tw->nswitches-1].pos;
541 sw = tw->switches[tw->nswitches-1].sw;
542 chars = (const char *)&sw->sw[sw->len];
543
544 if (swpos < sw->len) {
545 depth = tw->switches[tw->nswitches-1].depth;
546 off = sw->sw[swpos].subnode;
547 if (buf)
548 buf[depth++] = chars[swpos];
549 assert(tw->count == tw->switches[tw->nswitches-1].count);
550 tw->switches[tw->nswitches-1].count += sw->sw[swpos].subcount;
551 tw->switches[tw->nswitches-1].pos++;
552 break;
553 }
554
555 tw->nswitches--;
556 }
557 }
558
559 while (1) {
560 const struct trie_common *node = NODE(tw->t, off, trie_common);
561 if (node->type == TRIE_LEAF) {
562 const struct trie_leaf *lf = NODE(tw->t, off, trie_leaf);
563 if (buf)
564 assert(depth > 0 && buf[depth-1] == '\0');
565 tw->count++;
566 return &lf->file;
567 } else if (node->type == TRIE_STRING) {
568 const struct trie_string *st = NODE(tw->t, off, trie_string);
569
570 if (buf)
571 memcpy(buf + depth, st->string, st->stringlen);
572 depth += st->stringlen;
573 off = st->subnode;
574 } else if (node->type == TRIE_SWITCH) {
575 const struct trie_switch *sw = NODE(tw->t, off, trie_switch);
576 const char *chars = (const char *)&sw->sw[sw->len];
577
578 if (tw->nswitches >= tw->switchsize) {
579 tw->switchsize = tw->nswitches * 3 / 2 + 32;
580 tw->switches = sresize(tw->switches, tw->switchsize,
581 struct triewalk_switch);
582 }
583
584 tw->switches[tw->nswitches].sw = sw;
585 tw->switches[tw->nswitches].pos = 1;
586 tw->switches[tw->nswitches].depth = depth;
587 tw->switches[tw->nswitches].count = tw->count + sw->sw[0].subcount;
588 off = sw->sw[0].subnode;
589 if (buf)
590 buf[depth++] = chars[0];
591 tw->nswitches++;
592 }
593 }
594}
595void triewalk_free(triewalk *tw)
596{
597 sfree(tw->switches);
598 sfree(tw);
599}
600
601void trie_set_index_offset(void *t, off_t ptr)
602{
603 ((struct trie_header *)t)->indexroot = ptr;
604}
605off_t trie_get_index_offset(const void *t)
606{
607 return ((const struct trie_header *)t)->indexroot;
608}