345756fa910d607801737f21402bd3033ceb5877
[sgt/agedu] / trie.c
1 /*
2 * trie.c: implementation of trie.h.
3 */
4
5 #include "agedu.h"
6 #include "alloc.h"
7 #include "trie.h"
8
9 #define alignof(typ) ( offsetof(struct { char c; typ t; }, t) )
10
11 /*
12 * Compare functions for pathnames. Returns the relative order of
13 * the names, like strcmp; also passes back the offset of the
14 * first differing character if desired.
15 */
16 static int trieccmp(unsigned char a, unsigned char b)
17 {
18 a = (a == '\0' ? '\0' : a == pathsep ? '\1' : a+1);
19 b = (b == '\0' ? '\0' : b == pathsep ? '\1' : b+1);
20 return (int)a - (int)b;
21 }
22
23 static int triencmp(const char *a, size_t alen,
24 const char *b, size_t blen, int *offset)
25 {
26 int off = 0;
27 while (off < alen && off < blen && a[off] == b[off])
28 off++;
29 if (offset)
30 *offset = off;
31 if (off == alen || off == blen) return (off == blen) - (off == alen);
32 return trieccmp(a[off], b[off]);
33 }
34
35 static int triecmp(const char *a, const char *b, int *offset)
36 {
37 return triencmp(a, strlen(a), b, strlen(b), offset);
38 }
39
40 /* ----------------------------------------------------------------------
41 * Trie node structures.
42 *
43 * The trie format stored in the file consists of three distinct
44 * node types, each with a distinguishing type field at the start.
45 *
46 * TRIE_LEAF is a leaf node; it contains an actual trie_file
47 * structure, and indicates that when you're searching down the
48 * trie with a string, you should now expect to encounter
49 * end-of-string.
50 *
51 * TRIE_SWITCH indicates that the set of strings in the trie
52 * include strings with more than one distinct character after the
53 * prefix leading up to this point. Hence, it stores multiple
54 * subnode pointers and a different single character for each one.
55 *
56 * TRIE_STRING indicates that at this point everything in the trie
57 * has the same next few characters; it stores a single mandatory
58 * string fragment and exactly one subnode pointer.
59 */
60 enum {
61 TRIE_LEAF = 0x7fffe000,
62 TRIE_SWITCH,
63 TRIE_STRING
64 };
65
66 struct trie_common {
67 int type;
68 };
69
70 struct trie_switchentry {
71 off_t subnode;
72 int subcount;
73 };
74
75 struct trie_leaf {
76 struct trie_common c;
77 struct trie_file file;
78 };
79
80 struct trie_switch {
81 struct trie_common c;
82 /*
83 * sw[0] to sw[len-1] give the subnode pointers and element
84 * counts. At &sw[len] is stored len single bytes which are
85 * the characters corresponding to each subnode.
86 */
87 int len;
88 struct trie_switchentry sw[];
89 };
90
91 struct trie_string {
92 struct trie_common c;
93 int stringlen;
94 off_t subnode;
95 char string[];
96 };
97
98 struct trie_header {
99 unsigned long magic;
100 off_t root, indexroot;
101 int count;
102 size_t maxpathlen;
103 int pathsep;
104 };
105
106 /* Union only used for computing alignment */
107 union trie_node {
108 struct trie_leaf leaf;
109 struct { /* fake trie_switch with indeterminate array length filled in */
110 struct trie_common c;
111 int len;
112 struct trie_switchentry sw[1];
113 } sw;
114 struct { /* fake trie_string with indeterminate array length filled in */
115 struct trie_common c;
116 int stringlen;
117 off_t subnode;
118 char string[1];
119 } str;
120 };
121 #define TRIE_MAGIC 0x75646761UL
122 #define TRIE_ALIGN alignof(union trie_node)
123
124 /* ----------------------------------------------------------------------
125 * Trie-building functions.
126 */
127
128 struct tbswitch {
129 int len;
130 char c[256];
131 off_t off[256];
132 int count[256];
133 };
134
135 struct triebuild {
136 int fd;
137 off_t offset;
138 char *lastpath;
139 int lastlen, lastsize;
140 off_t lastoff;
141 struct tbswitch *switches;
142 int switchsize;
143 size_t maxpathlen;
144 };
145
146 static void tb_seek(triebuild *tb, off_t off)
147 {
148 tb->offset = off;
149 if (lseek(tb->fd, off, SEEK_SET) < 0) {
150 fprintf(stderr, PNAME ": lseek: %s\n", strerror(errno));
151 exit(1);
152 }
153 }
154
155 static void tb_write(triebuild *tb, const void *buf, size_t len)
156 {
157 tb->offset += len;
158 while (len > 0) {
159 int ret = write(tb->fd, buf, len);
160 if (ret < 0) {
161 fprintf(stderr, PNAME ": write: %s\n", strerror(errno));
162 exit(1);
163 }
164 len -= ret;
165 buf = (const void *)((const char *)buf + ret);
166 }
167 }
168
169 static char trie_align_zeroes[TRIE_ALIGN];
170
171 static void tb_align(triebuild *tb)
172 {
173 int off = (TRIE_ALIGN - ((tb)->offset % TRIE_ALIGN)) % TRIE_ALIGN;
174 tb_write(tb, trie_align_zeroes, off);
175 }
176
177 triebuild *triebuild_new(int fd)
178 {
179 triebuild *tb = snew(triebuild);
180 struct trie_header th;
181
182 tb->fd = fd;
183 tb->lastpath = NULL;
184 tb->lastlen = tb->lastsize = 0;
185 tb->lastoff = 0;
186 tb->switches = NULL;
187 tb->switchsize = 0;
188 tb->maxpathlen = 0;
189
190 th.magic = TRIE_MAGIC;
191 th.root = th.count = 0;
192 th.indexroot = 0;
193 th.maxpathlen = 0;
194 th.pathsep = (unsigned char)pathsep;
195
196 tb_seek(tb, 0);
197 tb_write(tb, &th, sizeof(th));
198
199 return tb;
200 }
201
202 static off_t triebuild_unwind(triebuild *tb, int targetdepth, int *outcount)
203 {
204 off_t offset;
205 int count, depth;
206
207 if (tb->lastoff == 0) {
208 *outcount = 0;
209 return 0;
210 }
211
212 offset = tb->lastoff;
213 count = 1;
214 depth = tb->lastlen + 1;
215
216 assert(depth >= targetdepth);
217
218 while (depth > targetdepth) {
219 int odepth = depth;
220 while (depth > targetdepth &&
221 (depth-1 > tb->switchsize || tb->switches[depth-1].len == 0))
222 depth--;
223 if (odepth > depth) {
224 /*
225 * Write out a string node.
226 */
227 size_t nodesize = sizeof(struct trie_string) + odepth - depth;
228 struct trie_string *st = (struct trie_string *)smalloc(nodesize);
229 st->c.type = TRIE_STRING;
230 st->stringlen = odepth - depth;
231 st->subnode = offset;
232 memcpy(st->string, tb->lastpath + depth, odepth - depth);
233 tb_align(tb);
234 offset = tb->offset;
235 tb_write(tb, st, nodesize);
236 sfree(st);
237 }
238
239 assert(depth >= targetdepth);
240 if (depth <= targetdepth)
241 break;
242
243 /*
244 * Now we expect to be sitting just below a switch node.
245 * Add our final entry to it and write it out.
246 */
247 depth--;
248 {
249 struct trie_switch *sw;
250 char *chars;
251 size_t nodesize;
252 int swlen = tb->switches[depth].len;
253 int i;
254
255 assert(swlen > 0);
256
257 tb->switches[depth].c[swlen] = tb->lastpath[depth];
258 tb->switches[depth].off[swlen] = offset;
259 tb->switches[depth].count[swlen] = count;
260 swlen++;
261
262 nodesize = sizeof(struct trie_switch) +
263 swlen * sizeof(struct trie_switchentry) + swlen;
264 sw = (struct trie_switch *)smalloc(nodesize);
265 chars = (char *)&sw->sw[swlen];
266
267 sw->c.type = TRIE_SWITCH;
268 sw->len = swlen;
269 count = 0;
270 for (i = 0; i < swlen; i++) {
271 sw->sw[i].subnode = tb->switches[depth].off[i];
272 sw->sw[i].subcount = tb->switches[depth].count[i];
273 chars[i] = tb->switches[depth].c[i];
274
275 count += tb->switches[depth].count[i];
276 }
277
278 tb_align(tb);
279 offset = tb->offset;
280 tb_write(tb, sw, nodesize);
281 sfree(sw);
282
283 tb->switches[depth].len = 0; /* clear this node */
284 }
285 }
286
287 *outcount = count;
288 return offset;
289 }
290
291 void triebuild_add(triebuild *tb, const char *pathname,
292 const struct trie_file *file)
293 {
294 int pathlen = strlen(pathname);
295 int depth;
296
297 if (tb->maxpathlen < pathlen+1)
298 tb->maxpathlen = pathlen+1;
299
300 if (tb->lastpath) {
301 off_t offset;
302 int count;
303
304 /*
305 * Find the first differing character between this pathname
306 * and the previous one.
307 */
308 int ret = triecmp(tb->lastpath, pathname, &depth);
309 assert(ret < 0);
310
311 /*
312 * Finalise all nodes above this depth.
313 */
314 offset = triebuild_unwind(tb, depth+1, &count);
315
316 /*
317 * Add the final node we just acquired to the switch node
318 * at our chosen depth, creating it if it isn't already
319 * there.
320 */
321 if (tb->switchsize <= depth) {
322 int oldsize = tb->switchsize;
323 tb->switchsize = depth * 3 / 2 + 64;
324 tb->switches = sresize(tb->switches, tb->switchsize,
325 struct tbswitch);
326 while (oldsize < tb->switchsize)
327 tb->switches[oldsize++].len = 0;
328 }
329
330 tb->switches[depth].c[tb->switches[depth].len] = tb->lastpath[depth];
331 tb->switches[depth].off[tb->switches[depth].len] = offset;
332 tb->switches[depth].count[tb->switches[depth].len] = count;
333 tb->switches[depth].len++;
334 }
335
336 /*
337 * Write out a leaf node for the new file, and remember its
338 * file offset.
339 */
340 {
341 struct trie_leaf leaf;
342
343 leaf.c.type = TRIE_LEAF;
344 leaf.file = *file; /* structure copy */
345
346 tb_align(tb);
347 tb->lastoff = tb->offset;
348 tb_write(tb, &leaf, sizeof(leaf));
349 }
350
351 /*
352 * Store this pathname for comparison with the next one.
353 */
354 if (tb->lastsize < pathlen+1) {
355 tb->lastsize = pathlen * 3 / 2 + 64;
356 tb->lastpath = sresize(tb->lastpath, tb->lastsize, char);
357 }
358 strcpy(tb->lastpath, pathname);
359 tb->lastlen = pathlen;
360 }
361
362 int triebuild_finish(triebuild *tb)
363 {
364 struct trie_header th;
365
366 th.magic = TRIE_MAGIC;
367 th.root = triebuild_unwind(tb, 0, &th.count);
368 th.indexroot = 0;
369 th.maxpathlen = tb->maxpathlen;
370 th.pathsep = (unsigned char)pathsep;
371
372 tb_seek(tb, 0);
373 tb_write(tb, &th, sizeof(th));
374
375 return th.count;
376 }
377
378 void triebuild_free(triebuild *tb)
379 {
380 sfree(tb->switches);
381 sfree(tb->lastpath);
382 sfree(tb);
383 }
384
385 /* ----------------------------------------------------------------------
386 * Memory-mapped trie modification.
387 */
388
389 #define MNODE(t, off, type) \
390 ((struct type *)((char *)(t) + (off)))
391
392 static unsigned long long fake_atime_recurse(void *t, struct trie_common *node,
393 int last_seen_pathsep)
394 {
395 while (node->type == TRIE_STRING) {
396 struct trie_string *st = (struct trie_string *)node;
397 last_seen_pathsep = (st->string[st->stringlen-1] == pathsep);
398 node = MNODE(t, st->subnode, trie_common);
399 }
400
401 if (node->type == TRIE_LEAF) {
402 struct trie_leaf *leaf = (struct trie_leaf *)node;
403 return leaf->file.atime;
404 } else if (assert(node->type == TRIE_SWITCH), 1) {
405 struct trie_switch *sw = (struct trie_switch *)node;
406 const char *chars = (const char *)&sw->sw[sw->len];
407 unsigned long long max = 0, subdir, ret;
408 int i;
409 int slashindex = -1, bareindex = -1;
410
411 /*
412 * First, process all the children of this node whose
413 * switch characters are not \0 or pathsep. We do this in
414 * reverse order so as to maintain best cache locality
415 * (tracking generally backwards through the file), though
416 * it doesn't matter semantically.
417 *
418 * For each of these children, we're just recursing into
419 * it to do any fixups required below it, and amalgamating
420 * the max atimes we get back.
421 */
422 for (i = sw->len; i-- > 0 ;) {
423 if (chars[i] == '\0') {
424 bareindex = i;
425 } else if (chars[i] == pathsep) {
426 slashindex = i;
427 } else {
428 ret = fake_atime_recurse(t, MNODE(t, sw->sw[i].subnode,
429 trie_common), 0);
430 if (max < ret)
431 max = ret;
432 }
433 }
434
435 /*
436 * Now we have at most two child nodes left to deal with:
437 * one with a slash (or general pathsep) and one with \0.
438 *
439 * If there's a slash node and a bare node, then the slash
440 * node contains precisely everything inside the directory
441 * described by the bare node; so we must retrieve the max
442 * atime for the slash node and use it to fix up the bare
443 * node.
444 *
445 * If there's only a bare node but the pathname leading up
446 * to this point ends in a slash, then _all_ of the child
447 * nodes of this node contain stuff inside the directory
448 * described by the bare node; so we use the whole of the
449 * maximum value we've computed so far to update the bare
450 * node.
451 */
452 if (slashindex >= 0) {
453 ret = fake_atime_recurse(t, MNODE(t, sw->sw[slashindex].subnode,
454 trie_common), 1);
455 if (max < ret)
456 max = ret;
457
458 subdir = ret;
459 } else if (last_seen_pathsep) {
460 subdir = max;
461 } else {
462 /* Don't update the bare subnode at all. */
463 subdir = 0;
464 }
465
466 if (bareindex >= 0) {
467 struct trie_leaf *leaf;
468
469 leaf = MNODE(t, sw->sw[bareindex].subnode, trie_leaf);
470
471 if (leaf && leaf->c.type == TRIE_LEAF) {
472 if (leaf->file.atime < subdir)
473 leaf->file.atime = subdir;
474 ret = leaf->file.atime;
475 } else {
476 /* Shouldn't really happen, but be cautious anyway */
477 ret = fake_atime_recurse(t, &leaf->c, 0);
478 }
479
480 if (max < ret)
481 max = ret;
482 }
483
484 return max;
485 }
486 }
487
488 void trie_fake_dir_atimes(void *t)
489 {
490 struct trie_header *hdr = MNODE(t, 0, trie_header);
491 struct trie_common *node = MNODE(t, hdr->root, trie_common);
492
493 fake_atime_recurse(t, node, 1);
494 }
495
496 /* ----------------------------------------------------------------------
497 * Querying functions.
498 */
499
500 #define NODE(t, off, type) \
501 ((const struct type *)((const char *)(t) + (off)))
502
503 size_t trie_maxpathlen(const void *t)
504 {
505 const struct trie_header *hdr = NODE(t, 0, trie_header);
506 return hdr->maxpathlen;
507 }
508
509 unsigned long trie_before(const void *t, const char *pathname)
510 {
511 const struct trie_header *hdr = NODE(t, 0, trie_header);
512 int ret = 0, lastcount = hdr->count;
513 int len = 1 + strlen(pathname), depth = 0;
514 off_t off = hdr->root;
515
516 while (1) {
517 const struct trie_common *node = NODE(t, off, trie_common);
518 if (node->type == TRIE_LEAF) {
519 if (depth < len)
520 ret += lastcount; /* _shouldn't_ happen, but in principle */
521 return ret;
522 } else if (node->type == TRIE_STRING) {
523 const struct trie_string *st = NODE(t, off, trie_string);
524
525 int offset;
526 int cmp = triencmp(st->string, st->stringlen,
527 pathname + depth, len-depth, &offset);
528
529 if (offset < st->stringlen) {
530 if (cmp < 0)
531 ret += lastcount;
532 return ret;
533 }
534
535 depth += st->stringlen;
536 off = st->subnode;
537 } else if (node->type == TRIE_SWITCH) {
538 const struct trie_switch *sw = NODE(t, off, trie_switch);
539 const char *chars = (const char *)&sw->sw[sw->len];
540 int i;
541
542 for (i = 0; i < sw->len; i++) {
543 int c = chars[i];
544 int cmp = trieccmp(pathname[depth], c);
545 if (cmp > 0)
546 ret += sw->sw[i].subcount;
547 else if (cmp < 0)
548 return ret;
549 else {
550 off = sw->sw[i].subnode;
551 lastcount = sw->sw[i].subcount;
552 depth++;
553 break;
554 }
555 }
556 if (i == sw->len)
557 return ret;
558 }
559 }
560 }
561
562 void trie_getpath(const void *t, unsigned long n, char *buf)
563 {
564 const struct trie_header *hdr = NODE(t, 0, trie_header);
565 int depth = 0;
566 off_t off = hdr->root;
567
568 while (1) {
569 const struct trie_common *node = NODE(t, off, trie_common);
570 if (node->type == TRIE_LEAF) {
571 assert(depth > 0 && buf[depth-1] == '\0');
572 return;
573 } else if (node->type == TRIE_STRING) {
574 const struct trie_string *st = NODE(t, off, trie_string);
575
576 memcpy(buf + depth, st->string, st->stringlen);
577 depth += st->stringlen;
578 off = st->subnode;
579 } else if (node->type == TRIE_SWITCH) {
580 const struct trie_switch *sw = NODE(t, off, trie_switch);
581 const char *chars = (const char *)&sw->sw[sw->len];
582 int i;
583
584 for (i = 0; i < sw->len; i++) {
585 if (n < sw->sw[i].subcount) {
586 buf[depth++] = chars[i];
587 off = sw->sw[i].subnode;
588 break;
589 } else
590 n -= sw->sw[i].subcount;
591 }
592 assert(i < sw->len);
593 }
594 }
595 }
596
597 unsigned long trie_count(const void *t)
598 {
599 const struct trie_header *hdr = NODE(t, 0, trie_header);
600 return hdr->count;
601 }
602
603 char trie_pathsep(const void *t)
604 {
605 const struct trie_header *hdr = NODE(t, 0, trie_header);
606 return (char)hdr->pathsep;
607 }
608
609 struct triewalk_switch {
610 const struct trie_switch *sw;
611 int pos, depth, count;
612 };
613 struct triewalk {
614 const void *t;
615 struct triewalk_switch *switches;
616 int nswitches, switchsize;
617 int count;
618 };
619 triewalk *triewalk_new(const void *vt)
620 {
621 triewalk *tw = snew(triewalk);
622
623 tw->t = (const char *)vt;
624 tw->switches = NULL;
625 tw->switchsize = 0;
626 tw->nswitches = -1;
627 tw->count = 0;
628
629 return tw;
630 }
631
632 void triewalk_rebase(triewalk *tw, const void *t)
633 {
634 ptrdiff_t diff = ((const unsigned char *)t - (const unsigned char *)(tw->t));
635 int i;
636
637 tw->t = t;
638
639 for (i = 0; i < tw->nswitches; i++)
640 tw->switches[i].sw = (const struct trie_switch *)
641 ((const unsigned char *)(tw->switches[i].sw) + diff);
642 }
643
644 const struct trie_file *triewalk_next(triewalk *tw, char *buf)
645 {
646 off_t off;
647 int depth;
648
649 if (tw->nswitches < 0) {
650 const struct trie_header *hdr = NODE(tw->t, 0, trie_header);
651 off = hdr->root;
652 depth = 0;
653 tw->nswitches = 0;
654 } else {
655 while (1) {
656 int swpos;
657 const struct trie_switch *sw;
658 const char *chars;
659
660 if (tw->nswitches == 0) {
661 assert(tw->count == NODE(tw->t, 0, trie_header)->count);
662 return NULL; /* run out of trie */
663 }
664
665 swpos = tw->switches[tw->nswitches-1].pos;
666 sw = tw->switches[tw->nswitches-1].sw;
667 chars = (const char *)&sw->sw[sw->len];
668
669 if (swpos < sw->len) {
670 depth = tw->switches[tw->nswitches-1].depth;
671 off = sw->sw[swpos].subnode;
672 if (buf)
673 buf[depth++] = chars[swpos];
674 assert(tw->count == tw->switches[tw->nswitches-1].count);
675 tw->switches[tw->nswitches-1].count += sw->sw[swpos].subcount;
676 tw->switches[tw->nswitches-1].pos++;
677 break;
678 }
679
680 tw->nswitches--;
681 }
682 }
683
684 while (1) {
685 const struct trie_common *node = NODE(tw->t, off, trie_common);
686 if (node->type == TRIE_LEAF) {
687 const struct trie_leaf *lf = NODE(tw->t, off, trie_leaf);
688 if (buf)
689 assert(depth > 0 && buf[depth-1] == '\0');
690 tw->count++;
691 return &lf->file;
692 } else if (node->type == TRIE_STRING) {
693 const struct trie_string *st = NODE(tw->t, off, trie_string);
694
695 if (buf)
696 memcpy(buf + depth, st->string, st->stringlen);
697 depth += st->stringlen;
698 off = st->subnode;
699 } else if (node->type == TRIE_SWITCH) {
700 const struct trie_switch *sw = NODE(tw->t, off, trie_switch);
701 const char *chars = (const char *)&sw->sw[sw->len];
702
703 if (tw->nswitches >= tw->switchsize) {
704 tw->switchsize = tw->nswitches * 3 / 2 + 32;
705 tw->switches = sresize(tw->switches, tw->switchsize,
706 struct triewalk_switch);
707 }
708
709 tw->switches[tw->nswitches].sw = sw;
710 tw->switches[tw->nswitches].pos = 1;
711 tw->switches[tw->nswitches].depth = depth;
712 tw->switches[tw->nswitches].count = tw->count + sw->sw[0].subcount;
713 off = sw->sw[0].subnode;
714 if (buf)
715 buf[depth++] = chars[0];
716 tw->nswitches++;
717 }
718 }
719 }
720 void triewalk_free(triewalk *tw)
721 {
722 sfree(tw->switches);
723 sfree(tw);
724 }
725
726 void trie_set_index_offset(void *t, off_t ptr)
727 {
728 ((struct trie_header *)t)->indexroot = ptr;
729 }
730 off_t trie_get_index_offset(const void *t)
731 {
732 return ((const struct trie_header *)t)->indexroot;
733 }
734
735 void make_successor(char *pathbuf)
736 {
737 int len = strlen(pathbuf);
738 if (len > 0 && pathbuf[len-1] == pathsep)
739 len--;
740 pathbuf[len] = '\001';
741 pathbuf[len+1] = '\0';
742 }