1344bdfd4f4736bb0d7b250a1f2c8dba7c46fa78
[sgt/agedu] / trie.c
1 /*
2 * trie.c: implementation of trie.h.
3 */
4
5 #include <assert.h>
6 #include <stdio.h>
7 #include <string.h>
8 #include <stdlib.h>
9 #include <errno.h>
10
11 #include <sys/types.h>
12 #include <unistd.h>
13
14 #include "agedu.h"
15 #include "malloc.h"
16 #include "trie.h"
17
18 #define alignof(typ) ( offsetof(struct { char c; typ t; }, t) )
19
20 /*
21 * Compare functions for pathnames. Returns the relative order of
22 * the names, like strcmp; also passes back the offset of the
23 * first differing character if desired.
24 */
25 static int trieccmp(unsigned char a, unsigned char b)
26 {
27 a = (a == '\0' ? '\0' : a == pathsep ? '\1' : a+1);
28 b = (b == '\0' ? '\0' : b == pathsep ? '\1' : b+1);
29 return (int)a - (int)b;
30 }
31
32 static int triencmp(const char *a, size_t alen,
33 const char *b, size_t blen, int *offset)
34 {
35 int off = 0;
36 while (off < alen && off < blen && a[off] == b[off])
37 off++;
38 if (offset)
39 *offset = off;
40 if (off == alen || off == blen) return (off == blen) - (off == alen);
41 return trieccmp(a[off], b[off]);
42 }
43
44 static int triecmp(const char *a, const char *b, int *offset)
45 {
46 return triencmp(a, strlen(a), b, strlen(b), offset);
47 }
48
49 /* ----------------------------------------------------------------------
50 * Trie node structures.
51 *
52 * The trie format stored in the file consists of three distinct
53 * node types, each with a distinguishing type field at the start.
54 *
55 * TRIE_LEAF is a leaf node; it contains an actual trie_file
56 * structure, and indicates that when you're searching down the
57 * trie with a string, you should now expect to encounter
58 * end-of-string.
59 *
60 * TRIE_SWITCH indicates that the set of strings in the trie
61 * include strings with more than one distinct character after the
62 * prefix leading up to this point. Hence, it stores multiple
63 * subnode pointers and a different single character for each one.
64 *
65 * TRIE_STRING indicates that at this point everything in the trie
66 * has the same next few characters; it stores a single mandatory
67 * string fragment and exactly one subnode pointer.
68 */
69 enum {
70 TRIE_LEAF = 0x7fffe000,
71 TRIE_SWITCH,
72 TRIE_STRING
73 };
74
75 struct trie_common {
76 int type;
77 };
78
79 struct trie_switchentry {
80 off_t subnode;
81 int subcount;
82 };
83
84 struct trie_leaf {
85 struct trie_common c;
86 struct trie_file file;
87 };
88
89 struct trie_switch {
90 struct trie_common c;
91 /*
92 * sw[0] to sw[len-1] give the subnode pointers and element
93 * counts. At &sw[len] is stored len single bytes which are
94 * the characters corresponding to each subnode.
95 */
96 int len;
97 struct trie_switchentry sw[];
98 };
99
100 struct trie_string {
101 struct trie_common c;
102 int stringlen;
103 off_t subnode;
104 char string[];
105 };
106
107 struct trie_header {
108 unsigned long magic;
109 off_t root, indexroot;
110 int count;
111 size_t maxpathlen;
112 int pathsep;
113 };
114
115 /* Union only used for computing alignment */
116 union trie_node {
117 struct trie_leaf leaf;
118 struct { /* fake trie_switch with indeterminate array length filled in */
119 struct trie_common c;
120 int len;
121 struct trie_switchentry sw[1];
122 } sw;
123 struct { /* fake trie_string with indeterminate array length filled in */
124 struct trie_common c;
125 int stringlen;
126 off_t subnode;
127 char string[1];
128 } str;
129 };
130 #define TRIE_MAGIC 0x75646761UL
131 #define TRIE_ALIGN alignof(union trie_node)
132
133 /* ----------------------------------------------------------------------
134 * Trie-building functions.
135 */
136
137 struct tbswitch {
138 int len;
139 char c[256];
140 off_t off[256];
141 int count[256];
142 };
143
144 struct triebuild {
145 int fd;
146 off_t offset;
147 char *lastpath;
148 int lastlen, lastsize;
149 off_t lastoff;
150 struct tbswitch *switches;
151 int switchsize;
152 size_t maxpathlen;
153 };
154
155 static void tb_seek(triebuild *tb, off_t off)
156 {
157 tb->offset = off;
158 if (lseek(tb->fd, off, SEEK_SET) < 0) {
159 fprintf(stderr, PNAME ": lseek: %s\n", strerror(errno));
160 exit(1);
161 }
162 }
163
164 static void tb_write(triebuild *tb, const void *buf, size_t len)
165 {
166 tb->offset += len;
167 while (len > 0) {
168 int ret = write(tb->fd, buf, len);
169 if (ret < 0) {
170 fprintf(stderr, PNAME ": write: %s\n", strerror(errno));
171 exit(1);
172 }
173 len -= ret;
174 buf = (const void *)((const char *)buf + ret);
175 }
176 }
177
178 static char trie_align_zeroes[TRIE_ALIGN];
179
180 static void tb_align(triebuild *tb)
181 {
182 int off = (TRIE_ALIGN - ((tb)->offset % TRIE_ALIGN)) % TRIE_ALIGN;
183 tb_write(tb, trie_align_zeroes, off);
184 }
185
186 triebuild *triebuild_new(int fd)
187 {
188 triebuild *tb = snew(triebuild);
189 struct trie_header th;
190
191 tb->fd = fd;
192 tb->lastpath = NULL;
193 tb->lastlen = tb->lastsize = 0;
194 tb->lastoff = 0;
195 tb->switches = NULL;
196 tb->switchsize = 0;
197 tb->maxpathlen = 0;
198
199 th.magic = TRIE_MAGIC;
200 th.root = th.count = 0;
201 th.indexroot = 0;
202 th.maxpathlen = 0;
203 th.pathsep = (unsigned char)pathsep;
204
205 tb_seek(tb, 0);
206 tb_write(tb, &th, sizeof(th));
207
208 return tb;
209 }
210
211 static off_t triebuild_unwind(triebuild *tb, int targetdepth, int *outcount)
212 {
213 off_t offset;
214 int count, depth;
215
216 if (tb->lastoff == 0) {
217 *outcount = 0;
218 return 0;
219 }
220
221 offset = tb->lastoff;
222 count = 1;
223 depth = tb->lastlen + 1;
224
225 assert(depth >= targetdepth);
226
227 while (depth > targetdepth) {
228 int odepth = depth;
229 while (depth > targetdepth &&
230 (depth-1 > tb->switchsize || tb->switches[depth-1].len == 0))
231 depth--;
232 if (odepth > depth) {
233 /*
234 * Write out a string node.
235 */
236 size_t nodesize = sizeof(struct trie_string) + odepth - depth;
237 struct trie_string *st = (struct trie_string *)smalloc(nodesize);
238 st->c.type = TRIE_STRING;
239 st->stringlen = odepth - depth;
240 st->subnode = offset;
241 memcpy(st->string, tb->lastpath + depth, odepth - depth);
242 tb_align(tb);
243 offset = tb->offset;
244 tb_write(tb, st, nodesize);
245 sfree(st);
246 }
247
248 assert(depth >= targetdepth);
249 if (depth <= targetdepth)
250 break;
251
252 /*
253 * Now we expect to be sitting just below a switch node.
254 * Add our final entry to it and write it out.
255 */
256 depth--;
257 {
258 struct trie_switch *sw;
259 char *chars;
260 size_t nodesize;
261 int swlen = tb->switches[depth].len;
262 int i;
263
264 assert(swlen > 0);
265
266 tb->switches[depth].c[swlen] = tb->lastpath[depth];
267 tb->switches[depth].off[swlen] = offset;
268 tb->switches[depth].count[swlen] = count;
269 swlen++;
270
271 nodesize = sizeof(struct trie_switch) +
272 swlen * sizeof(struct trie_switchentry) + swlen;
273 sw = (struct trie_switch *)smalloc(nodesize);
274 chars = (char *)&sw->sw[swlen];
275
276 sw->c.type = TRIE_SWITCH;
277 sw->len = swlen;
278 count = 0;
279 for (i = 0; i < swlen; i++) {
280 sw->sw[i].subnode = tb->switches[depth].off[i];
281 sw->sw[i].subcount = tb->switches[depth].count[i];
282 chars[i] = tb->switches[depth].c[i];
283
284 count += tb->switches[depth].count[i];
285 }
286
287 tb_align(tb);
288 offset = tb->offset;
289 tb_write(tb, sw, nodesize);
290 sfree(sw);
291
292 tb->switches[depth].len = 0; /* clear this node */
293 }
294 }
295
296 *outcount = count;
297 return offset;
298 }
299
300 void triebuild_add(triebuild *tb, const char *pathname,
301 const struct trie_file *file)
302 {
303 int pathlen = strlen(pathname);
304 int depth;
305
306 if (tb->maxpathlen < pathlen+1)
307 tb->maxpathlen = pathlen+1;
308
309 if (tb->lastpath) {
310 off_t offset;
311 int count;
312
313 /*
314 * Find the first differing character between this pathname
315 * and the previous one.
316 */
317 int ret = triecmp(tb->lastpath, pathname, &depth);
318 assert(ret < 0);
319
320 /*
321 * Finalise all nodes above this depth.
322 */
323 offset = triebuild_unwind(tb, depth+1, &count);
324
325 /*
326 * Add the final node we just acquired to the switch node
327 * at our chosen depth, creating it if it isn't already
328 * there.
329 */
330 if (tb->switchsize <= depth) {
331 int oldsize = tb->switchsize;
332 tb->switchsize = depth * 3 / 2 + 64;
333 tb->switches = sresize(tb->switches, tb->switchsize,
334 struct tbswitch);
335 while (oldsize < tb->switchsize)
336 tb->switches[oldsize++].len = 0;
337 }
338
339 tb->switches[depth].c[tb->switches[depth].len] = tb->lastpath[depth];
340 tb->switches[depth].off[tb->switches[depth].len] = offset;
341 tb->switches[depth].count[tb->switches[depth].len] = count;
342 tb->switches[depth].len++;
343 }
344
345 /*
346 * Write out a leaf node for the new file, and remember its
347 * file offset.
348 */
349 {
350 struct trie_leaf leaf;
351
352 leaf.c.type = TRIE_LEAF;
353 leaf.file = *file; /* structure copy */
354
355 tb_align(tb);
356 tb->lastoff = tb->offset;
357 tb_write(tb, &leaf, sizeof(leaf));
358 }
359
360 /*
361 * Store this pathname for comparison with the next one.
362 */
363 if (tb->lastsize < pathlen+1) {
364 tb->lastsize = pathlen * 3 / 2 + 64;
365 tb->lastpath = sresize(tb->lastpath, tb->lastsize, char);
366 }
367 strcpy(tb->lastpath, pathname);
368 tb->lastlen = pathlen;
369 }
370
371 int triebuild_finish(triebuild *tb)
372 {
373 struct trie_header th;
374
375 th.magic = TRIE_MAGIC;
376 th.root = triebuild_unwind(tb, 0, &th.count);
377 th.indexroot = 0;
378 th.maxpathlen = tb->maxpathlen;
379 th.pathsep = (unsigned char)pathsep;
380
381 tb_seek(tb, 0);
382 tb_write(tb, &th, sizeof(th));
383
384 return th.count;
385 }
386
387 void triebuild_free(triebuild *tb)
388 {
389 sfree(tb->switches);
390 sfree(tb->lastpath);
391 sfree(tb);
392 }
393
394 /* ----------------------------------------------------------------------
395 * Memory-mapped trie modification.
396 */
397
398 #define MNODE(t, off, type) \
399 ((struct type *)((char *)(t) + (off)))
400
401 static unsigned long long fake_atime_recurse(void *t, struct trie_common *node,
402 int last_seen_pathsep)
403 {
404 while (node->type == TRIE_STRING) {
405 struct trie_string *st = (struct trie_string *)node;
406 last_seen_pathsep = (st->string[st->stringlen-1] == pathsep);
407 node = MNODE(t, st->subnode, trie_common);
408 }
409
410 if (node->type == TRIE_LEAF) {
411 struct trie_leaf *leaf = (struct trie_leaf *)node;
412 return leaf->file.atime;
413 } else if (assert(node->type == TRIE_SWITCH), 1) {
414 struct trie_switch *sw = (struct trie_switch *)node;
415 const char *chars = (const char *)&sw->sw[sw->len];
416 unsigned long long max = 0, subdir, ret;
417 int i;
418 int slashindex = -1, bareindex = -1;
419
420 /*
421 * First, process all the children of this node whose
422 * switch characters are not \0 or pathsep. We do this in
423 * reverse order so as to maintain best cache locality
424 * (tracking generally backwards through the file), though
425 * it doesn't matter semantically.
426 *
427 * For each of these children, we're just recursing into
428 * it to do any fixups required below it, and amalgamating
429 * the max atimes we get back.
430 */
431 for (i = sw->len; i-- > 0 ;) {
432 if (chars[i] == '\0') {
433 bareindex = i;
434 } else if (chars[i] == pathsep) {
435 slashindex = i;
436 } else {
437 ret = fake_atime_recurse(t, MNODE(t, sw->sw[i].subnode,
438 trie_common), 0);
439 if (max < ret)
440 max = ret;
441 }
442 }
443
444 /*
445 * Now we have at most two child nodes left to deal with:
446 * one with a slash (or general pathsep) and one with \0.
447 *
448 * If there's a slash node and a bare node, then the slash
449 * node contains precisely everything inside the directory
450 * described by the bare node; so we must retrieve the max
451 * atime for the slash node and use it to fix up the bare
452 * node.
453 *
454 * If there's only a bare node but the pathname leading up
455 * to this point ends in a slash, then _all_ of the child
456 * nodes of this node contain stuff inside the directory
457 * described by the bare node; so we use the whole of the
458 * maximum value we've computed so far to update the bare
459 * node.
460 */
461 if (slashindex >= 0) {
462 ret = fake_atime_recurse(t, MNODE(t, sw->sw[slashindex].subnode,
463 trie_common), 1);
464 if (max < ret)
465 max = ret;
466
467 subdir = ret;
468 } else if (last_seen_pathsep) {
469 subdir = max;
470 } else {
471 /* Don't update the bare subnode at all. */
472 subdir = 0;
473 }
474
475 if (bareindex >= 0) {
476 struct trie_leaf *leaf;
477
478 leaf = MNODE(t, sw->sw[bareindex].subnode, trie_leaf);
479
480 if (leaf && leaf->c.type == TRIE_LEAF) {
481 if (leaf->file.atime < subdir)
482 leaf->file.atime = subdir;
483 ret = leaf->file.atime;
484 } else {
485 /* Shouldn't really happen, but be cautious anyway */
486 ret = fake_atime_recurse(t, &leaf->c, 0);
487 }
488
489 if (max < ret)
490 max = ret;
491 }
492
493 return max;
494 }
495 }
496
497 void trie_fake_dir_atimes(void *t)
498 {
499 struct trie_header *hdr = MNODE(t, 0, trie_header);
500 struct trie_common *node = MNODE(t, hdr->root, trie_common);
501
502 fake_atime_recurse(t, node, 1);
503 }
504
505 /* ----------------------------------------------------------------------
506 * Querying functions.
507 */
508
509 #define NODE(t, off, type) \
510 ((const struct type *)((const char *)(t) + (off)))
511
512 size_t trie_maxpathlen(const void *t)
513 {
514 const struct trie_header *hdr = NODE(t, 0, trie_header);
515 return hdr->maxpathlen;
516 }
517
518 unsigned long trie_before(const void *t, const char *pathname)
519 {
520 const struct trie_header *hdr = NODE(t, 0, trie_header);
521 int ret = 0, lastcount = hdr->count;
522 int len = 1 + strlen(pathname), depth = 0;
523 off_t off = hdr->root;
524
525 while (1) {
526 const struct trie_common *node = NODE(t, off, trie_common);
527 if (node->type == TRIE_LEAF) {
528 if (depth < len)
529 ret += lastcount; /* _shouldn't_ happen, but in principle */
530 return ret;
531 } else if (node->type == TRIE_STRING) {
532 const struct trie_string *st = NODE(t, off, trie_string);
533
534 int offset;
535 int cmp = triencmp(st->string, st->stringlen,
536 pathname + depth, len-depth, &offset);
537
538 if (offset < st->stringlen) {
539 if (cmp < 0)
540 ret += lastcount;
541 return ret;
542 }
543
544 depth += st->stringlen;
545 off = st->subnode;
546 } else if (node->type == TRIE_SWITCH) {
547 const struct trie_switch *sw = NODE(t, off, trie_switch);
548 const char *chars = (const char *)&sw->sw[sw->len];
549 int i;
550
551 for (i = 0; i < sw->len; i++) {
552 int c = chars[i];
553 int cmp = trieccmp(pathname[depth], c);
554 if (cmp > 0)
555 ret += sw->sw[i].subcount;
556 else if (cmp < 0)
557 return ret;
558 else {
559 off = sw->sw[i].subnode;
560 lastcount = sw->sw[i].subcount;
561 depth++;
562 break;
563 }
564 }
565 if (i == sw->len)
566 return ret;
567 }
568 }
569 }
570
571 void trie_getpath(const void *t, unsigned long n, char *buf)
572 {
573 const struct trie_header *hdr = NODE(t, 0, trie_header);
574 int depth = 0;
575 off_t off = hdr->root;
576
577 while (1) {
578 const struct trie_common *node = NODE(t, off, trie_common);
579 if (node->type == TRIE_LEAF) {
580 assert(depth > 0 && buf[depth-1] == '\0');
581 return;
582 } else if (node->type == TRIE_STRING) {
583 const struct trie_string *st = NODE(t, off, trie_string);
584
585 memcpy(buf + depth, st->string, st->stringlen);
586 depth += st->stringlen;
587 off = st->subnode;
588 } else if (node->type == TRIE_SWITCH) {
589 const struct trie_switch *sw = NODE(t, off, trie_switch);
590 const char *chars = (const char *)&sw->sw[sw->len];
591 int i;
592
593 for (i = 0; i < sw->len; i++) {
594 if (n < sw->sw[i].subcount) {
595 buf[depth++] = chars[i];
596 off = sw->sw[i].subnode;
597 break;
598 } else
599 n -= sw->sw[i].subcount;
600 }
601 assert(i < sw->len);
602 }
603 }
604 }
605
606 unsigned long trie_count(const void *t)
607 {
608 const struct trie_header *hdr = NODE(t, 0, trie_header);
609 return hdr->count;
610 }
611
612 char trie_pathsep(const void *t)
613 {
614 const struct trie_header *hdr = NODE(t, 0, trie_header);
615 return (char)hdr->pathsep;
616 }
617
618 struct triewalk_switch {
619 const struct trie_switch *sw;
620 int pos, depth, count;
621 };
622 struct triewalk {
623 const void *t;
624 struct triewalk_switch *switches;
625 int nswitches, switchsize;
626 int count;
627 };
628 triewalk *triewalk_new(const void *vt)
629 {
630 triewalk *tw = snew(triewalk);
631
632 tw->t = (const char *)vt;
633 tw->switches = NULL;
634 tw->switchsize = 0;
635 tw->nswitches = -1;
636 tw->count = 0;
637
638 return tw;
639 }
640 const struct trie_file *triewalk_next(triewalk *tw, char *buf)
641 {
642 off_t off;
643 int depth;
644
645 if (tw->nswitches < 0) {
646 const struct trie_header *hdr = NODE(tw->t, 0, trie_header);
647 off = hdr->root;
648 depth = 0;
649 tw->nswitches = 0;
650 } else {
651 while (1) {
652 int swpos;
653 const struct trie_switch *sw;
654 const char *chars;
655
656 if (tw->nswitches == 0) {
657 assert(tw->count == NODE(tw->t, 0, trie_header)->count);
658 return NULL; /* run out of trie */
659 }
660
661 swpos = tw->switches[tw->nswitches-1].pos;
662 sw = tw->switches[tw->nswitches-1].sw;
663 chars = (const char *)&sw->sw[sw->len];
664
665 if (swpos < sw->len) {
666 depth = tw->switches[tw->nswitches-1].depth;
667 off = sw->sw[swpos].subnode;
668 if (buf)
669 buf[depth++] = chars[swpos];
670 assert(tw->count == tw->switches[tw->nswitches-1].count);
671 tw->switches[tw->nswitches-1].count += sw->sw[swpos].subcount;
672 tw->switches[tw->nswitches-1].pos++;
673 break;
674 }
675
676 tw->nswitches--;
677 }
678 }
679
680 while (1) {
681 const struct trie_common *node = NODE(tw->t, off, trie_common);
682 if (node->type == TRIE_LEAF) {
683 const struct trie_leaf *lf = NODE(tw->t, off, trie_leaf);
684 if (buf)
685 assert(depth > 0 && buf[depth-1] == '\0');
686 tw->count++;
687 return &lf->file;
688 } else if (node->type == TRIE_STRING) {
689 const struct trie_string *st = NODE(tw->t, off, trie_string);
690
691 if (buf)
692 memcpy(buf + depth, st->string, st->stringlen);
693 depth += st->stringlen;
694 off = st->subnode;
695 } else if (node->type == TRIE_SWITCH) {
696 const struct trie_switch *sw = NODE(tw->t, off, trie_switch);
697 const char *chars = (const char *)&sw->sw[sw->len];
698
699 if (tw->nswitches >= tw->switchsize) {
700 tw->switchsize = tw->nswitches * 3 / 2 + 32;
701 tw->switches = sresize(tw->switches, tw->switchsize,
702 struct triewalk_switch);
703 }
704
705 tw->switches[tw->nswitches].sw = sw;
706 tw->switches[tw->nswitches].pos = 1;
707 tw->switches[tw->nswitches].depth = depth;
708 tw->switches[tw->nswitches].count = tw->count + sw->sw[0].subcount;
709 off = sw->sw[0].subnode;
710 if (buf)
711 buf[depth++] = chars[0];
712 tw->nswitches++;
713 }
714 }
715 }
716 void triewalk_free(triewalk *tw)
717 {
718 sfree(tw->switches);
719 sfree(tw);
720 }
721
722 void trie_set_index_offset(void *t, off_t ptr)
723 {
724 ((struct trie_header *)t)->indexroot = ptr;
725 }
726 off_t trie_get_index_offset(const void *t)
727 {
728 return ((const struct trie_header *)t)->indexroot;
729 }
730
731 void make_successor(char *pathbuf)
732 {
733 int len = strlen(pathbuf);
734 if (len > 0 && pathbuf[len-1] == pathsep)
735 len--;
736 pathbuf[len] = '\001';
737 pathbuf[len+1] = '\0';
738 }