Fiddle about with the configure script so it notices the need for
[sgt/agedu] / trie.c
1 /*
2 * trie.c: implementation of trie.h.
3 */
4
5 #include "agedu.h"
6 #include "alloc.h"
7 #include "trie.h"
8
9 #define alignof(typ) ( offsetof(struct { char c; typ t; }, t) )
10
11 /*
12 * Compare functions for pathnames. Returns the relative order of
13 * the names, like strcmp; also passes back the offset of the
14 * first differing character if desired.
15 */
16 static int trieccmp(unsigned char a, unsigned char b)
17 {
18 int ia = (a == '\0' ? '\0' : a == pathsep ? '\1' : a+1);
19 int ib = (b == '\0' ? '\0' : b == pathsep ? '\1' : b+1);
20 return ia - ib;
21 }
22
23 static int triencmp(const char *a, size_t alen,
24 const char *b, size_t blen, int *offset)
25 {
26 int off = 0;
27 while (off < alen && off < blen && a[off] == b[off])
28 off++;
29 if (offset)
30 *offset = off;
31 if (off == alen || off == blen) return (off == blen) - (off == alen);
32 return trieccmp(a[off], b[off]);
33 }
34
35 static int triecmp(const char *a, const char *b, int *offset)
36 {
37 return triencmp(a, strlen(a), b, strlen(b), offset);
38 }
39
40 /* ----------------------------------------------------------------------
41 * Trie node structures.
42 *
43 * The trie format stored in the file consists of three distinct
44 * node types, each with a distinguishing type field at the start.
45 *
46 * TRIE_LEAF is a leaf node; it contains an actual trie_file
47 * structure, and indicates that when you're searching down the
48 * trie with a string, you should now expect to encounter
49 * end-of-string.
50 *
51 * TRIE_SWITCH indicates that the set of strings in the trie
52 * include strings with more than one distinct character after the
53 * prefix leading up to this point. Hence, it stores multiple
54 * subnode pointers and a different single character for each one.
55 *
56 * TRIE_STRING indicates that at this point everything in the trie
57 * has the same next few characters; it stores a single mandatory
58 * string fragment and exactly one subnode pointer.
59 */
60 enum {
61 TRIE_LEAF = 0x7fffe000,
62 TRIE_SWITCH,
63 TRIE_STRING
64 };
65
66 struct trie_common {
67 int type;
68 };
69
70 struct trie_switchentry {
71 off_t subnode;
72 int subcount;
73 };
74
75 struct trie_leaf {
76 struct trie_common c;
77 struct trie_file file;
78 };
79
80 struct trie_switch {
81 struct trie_common c;
82 /*
83 * sw[0] to sw[len-1] give the subnode pointers and element
84 * counts. At &sw[len] is stored len single bytes which are
85 * the characters corresponding to each subnode.
86 */
87 int len;
88 struct trie_switchentry sw[];
89 };
90
91 struct trie_string {
92 struct trie_common c;
93 int stringlen;
94 off_t subnode;
95 char string[];
96 };
97
98 struct trie_header {
99 unsigned long magic;
100 off_t root, indexroot;
101 int count;
102 size_t maxpathlen;
103 int pathsep;
104 };
105
106 /* Union only used for computing alignment */
107 union trie_node {
108 struct trie_leaf leaf;
109 struct { /* fake trie_switch with indeterminate array length filled in */
110 struct trie_common c;
111 int len;
112 struct trie_switchentry sw[1];
113 } sw;
114 struct { /* fake trie_string with indeterminate array length filled in */
115 struct trie_common c;
116 int stringlen;
117 off_t subnode;
118 char string[1];
119 } str;
120 };
121 #define TRIE_MAGIC 0x75646761UL
122 #define TRIE_ALIGN alignof(union trie_node)
123
124 /* ----------------------------------------------------------------------
125 * Trie-building functions.
126 */
127
128 struct tbswitch {
129 int len;
130 char c[256];
131 off_t off[256];
132 int count[256];
133 };
134
135 struct triebuild {
136 int fd;
137 off_t offset;
138 char *lastpath;
139 int lastlen, lastsize;
140 off_t lastoff;
141 struct tbswitch *switches;
142 int switchsize;
143 size_t maxpathlen;
144 };
145
146 static void tb_seek(triebuild *tb, off_t off)
147 {
148 tb->offset = off;
149 if (lseek(tb->fd, off, SEEK_SET) < 0) {
150 fprintf(stderr, PNAME ": lseek: %s\n", strerror(errno));
151 exit(1);
152 }
153 }
154
155 static void tb_write(triebuild *tb, const void *buf, size_t len)
156 {
157 tb->offset += len;
158 while (len > 0) {
159 int ret = write(tb->fd, buf, len);
160 if (ret < 0) {
161 fprintf(stderr, PNAME ": write: %s\n", strerror(errno));
162 exit(1);
163 }
164 len -= ret;
165 buf = (const void *)((const char *)buf + ret);
166 }
167 }
168
169 static char trie_align_zeroes[TRIE_ALIGN];
170
171 static void tb_align(triebuild *tb)
172 {
173 int off = (TRIE_ALIGN - ((tb)->offset % TRIE_ALIGN)) % TRIE_ALIGN;
174 tb_write(tb, trie_align_zeroes, off);
175 }
176
177 triebuild *triebuild_new(int fd)
178 {
179 triebuild *tb = snew(triebuild);
180 struct trie_header th;
181
182 tb->fd = fd;
183 tb->lastpath = NULL;
184 tb->lastlen = tb->lastsize = 0;
185 tb->lastoff = 0;
186 tb->switches = NULL;
187 tb->switchsize = 0;
188 tb->maxpathlen = 0;
189
190 th.magic = TRIE_MAGIC;
191 th.root = th.count = 0;
192 th.indexroot = 0;
193 th.maxpathlen = 0;
194 th.pathsep = (unsigned char)pathsep;
195
196 tb_seek(tb, 0);
197 tb_write(tb, &th, sizeof(th));
198
199 return tb;
200 }
201
202 static off_t triebuild_unwind(triebuild *tb, int targetdepth, int *outcount)
203 {
204 off_t offset;
205 int count, depth;
206
207 if (tb->lastoff == 0) {
208 *outcount = 0;
209 return 0;
210 }
211
212 offset = tb->lastoff;
213 count = 1;
214 depth = tb->lastlen + 1;
215
216 assert(depth >= targetdepth);
217
218 while (depth > targetdepth) {
219 int odepth = depth;
220 while (depth > targetdepth &&
221 (depth-1 > tb->switchsize || !tb->switches ||
222 tb->switches[depth-1].len == 0))
223 depth--;
224 if (odepth > depth) {
225 /*
226 * Write out a string node.
227 */
228 size_t nodesize = sizeof(struct trie_string) + odepth - depth;
229 struct trie_string *st = (struct trie_string *)smalloc(nodesize);
230 st->c.type = TRIE_STRING;
231 st->stringlen = odepth - depth;
232 st->subnode = offset;
233 memcpy(st->string, tb->lastpath + depth, odepth - depth);
234 tb_align(tb);
235 offset = tb->offset;
236 tb_write(tb, st, nodesize);
237 sfree(st);
238 }
239
240 assert(depth >= targetdepth);
241 if (depth <= targetdepth)
242 break;
243
244 /*
245 * Now we expect to be sitting just below a switch node.
246 * Add our final entry to it and write it out.
247 */
248 depth--;
249 {
250 struct trie_switch *sw;
251 char *chars;
252 size_t nodesize;
253 int swlen = tb->switches[depth].len;
254 int i;
255
256 assert(swlen > 0);
257
258 tb->switches[depth].c[swlen] = tb->lastpath[depth];
259 tb->switches[depth].off[swlen] = offset;
260 tb->switches[depth].count[swlen] = count;
261 swlen++;
262
263 nodesize = sizeof(struct trie_switch) +
264 swlen * sizeof(struct trie_switchentry) + swlen;
265 sw = (struct trie_switch *)smalloc(nodesize);
266 chars = (char *)&sw->sw[swlen];
267
268 sw->c.type = TRIE_SWITCH;
269 sw->len = swlen;
270 count = 0;
271 for (i = 0; i < swlen; i++) {
272 sw->sw[i].subnode = tb->switches[depth].off[i];
273 sw->sw[i].subcount = tb->switches[depth].count[i];
274 chars[i] = tb->switches[depth].c[i];
275
276 count += tb->switches[depth].count[i];
277 }
278
279 tb_align(tb);
280 offset = tb->offset;
281 tb_write(tb, sw, nodesize);
282 sfree(sw);
283
284 tb->switches[depth].len = 0; /* clear this node */
285 }
286 }
287
288 *outcount = count;
289 return offset;
290 }
291
292 void triebuild_add(triebuild *tb, const char *pathname,
293 const struct trie_file *file)
294 {
295 int pathlen = strlen(pathname);
296 int depth;
297
298 if (tb->maxpathlen < pathlen+1)
299 tb->maxpathlen = pathlen+1;
300
301 if (tb->lastpath) {
302 off_t offset;
303 int count;
304
305 /*
306 * Find the first differing character between this pathname
307 * and the previous one.
308 */
309 int ret = triecmp(tb->lastpath, pathname, &depth);
310 assert(ret < 0);
311
312 /*
313 * Finalise all nodes above this depth.
314 */
315 offset = triebuild_unwind(tb, depth+1, &count);
316
317 /*
318 * Add the final node we just acquired to the switch node
319 * at our chosen depth, creating it if it isn't already
320 * there.
321 */
322 if (tb->switchsize <= depth) {
323 int oldsize = tb->switchsize;
324 tb->switchsize = depth * 3 / 2 + 64;
325 tb->switches = sresize(tb->switches, tb->switchsize,
326 struct tbswitch);
327 while (oldsize < tb->switchsize)
328 tb->switches[oldsize++].len = 0;
329 }
330
331 tb->switches[depth].c[tb->switches[depth].len] = tb->lastpath[depth];
332 tb->switches[depth].off[tb->switches[depth].len] = offset;
333 tb->switches[depth].count[tb->switches[depth].len] = count;
334 tb->switches[depth].len++;
335 }
336
337 /*
338 * Write out a leaf node for the new file, and remember its
339 * file offset.
340 */
341 {
342 struct trie_leaf leaf;
343
344 leaf.c.type = TRIE_LEAF;
345 leaf.file = *file; /* structure copy */
346
347 tb_align(tb);
348 tb->lastoff = tb->offset;
349 tb_write(tb, &leaf, sizeof(leaf));
350 }
351
352 /*
353 * Store this pathname for comparison with the next one.
354 */
355 if (tb->lastsize < pathlen+1) {
356 tb->lastsize = pathlen * 3 / 2 + 64;
357 tb->lastpath = sresize(tb->lastpath, tb->lastsize, char);
358 }
359 strcpy(tb->lastpath, pathname);
360 tb->lastlen = pathlen;
361 }
362
363 int triebuild_finish(triebuild *tb)
364 {
365 struct trie_header th;
366
367 th.magic = TRIE_MAGIC;
368 th.root = triebuild_unwind(tb, 0, &th.count);
369 th.indexroot = 0;
370 th.maxpathlen = tb->maxpathlen;
371 th.pathsep = (unsigned char)pathsep;
372
373 tb_seek(tb, 0);
374 tb_write(tb, &th, sizeof(th));
375
376 return th.count;
377 }
378
379 void triebuild_free(triebuild *tb)
380 {
381 sfree(tb->switches);
382 sfree(tb->lastpath);
383 sfree(tb);
384 }
385
386 /* ----------------------------------------------------------------------
387 * Memory-mapped trie modification.
388 */
389
390 #define MNODE(t, off, type) \
391 ((struct type *)((char *)(t) + (off)))
392
393 static unsigned long long fake_atime_recurse(void *t, struct trie_common *node,
394 int last_seen_pathsep)
395 {
396 while (node->type == TRIE_STRING) {
397 struct trie_string *st = (struct trie_string *)node;
398 last_seen_pathsep = (st->string[st->stringlen-1] == pathsep);
399 node = MNODE(t, st->subnode, trie_common);
400 }
401
402 if (node->type == TRIE_LEAF) {
403 struct trie_leaf *leaf = (struct trie_leaf *)node;
404 return leaf->file.atime;
405 } else if (assert(node->type == TRIE_SWITCH), 1) {
406 struct trie_switch *sw = (struct trie_switch *)node;
407 const char *chars = (const char *)&sw->sw[sw->len];
408 unsigned long long max = 0, subdir, ret;
409 int i;
410 int slashindex = -1, bareindex = -1;
411
412 /*
413 * First, process all the children of this node whose
414 * switch characters are not \0 or pathsep. We do this in
415 * reverse order so as to maintain best cache locality
416 * (tracking generally backwards through the file), though
417 * it doesn't matter semantically.
418 *
419 * For each of these children, we're just recursing into
420 * it to do any fixups required below it, and amalgamating
421 * the max atimes we get back.
422 */
423 for (i = sw->len; i-- > 0 ;) {
424 if (chars[i] == '\0') {
425 bareindex = i;
426 } else if (chars[i] == pathsep) {
427 slashindex = i;
428 } else {
429 ret = fake_atime_recurse(t, MNODE(t, sw->sw[i].subnode,
430 trie_common), 0);
431 if (max < ret)
432 max = ret;
433 }
434 }
435
436 /*
437 * Now we have at most two child nodes left to deal with:
438 * one with a slash (or general pathsep) and one with \0.
439 *
440 * If there's a slash node and a bare node, then the slash
441 * node contains precisely everything inside the directory
442 * described by the bare node; so we must retrieve the max
443 * atime for the slash node and use it to fix up the bare
444 * node.
445 *
446 * If there's only a bare node but the pathname leading up
447 * to this point ends in a slash, then _all_ of the child
448 * nodes of this node contain stuff inside the directory
449 * described by the bare node; so we use the whole of the
450 * maximum value we've computed so far to update the bare
451 * node.
452 */
453 if (slashindex >= 0) {
454 ret = fake_atime_recurse(t, MNODE(t, sw->sw[slashindex].subnode,
455 trie_common), 1);
456 if (max < ret)
457 max = ret;
458
459 subdir = ret;
460 } else if (last_seen_pathsep) {
461 subdir = max;
462 } else {
463 /* Don't update the bare subnode at all. */
464 subdir = 0;
465 }
466
467 if (bareindex >= 0) {
468 struct trie_leaf *leaf;
469
470 leaf = MNODE(t, sw->sw[bareindex].subnode, trie_leaf);
471
472 if (leaf && leaf->c.type == TRIE_LEAF) {
473 if (leaf->file.atime < subdir)
474 leaf->file.atime = subdir;
475 ret = leaf->file.atime;
476 } else {
477 /* Shouldn't really happen, but be cautious anyway */
478 ret = fake_atime_recurse(t, &leaf->c, 0);
479 }
480
481 if (max < ret)
482 max = ret;
483 }
484
485 return max;
486 }
487 return 0; /* placate lint */
488 }
489
490 void trie_fake_dir_atimes(void *t)
491 {
492 struct trie_header *hdr = MNODE(t, 0, trie_header);
493 struct trie_common *node = MNODE(t, hdr->root, trie_common);
494
495 fake_atime_recurse(t, node, 1);
496 }
497
498 /* ----------------------------------------------------------------------
499 * Querying functions.
500 */
501
502 #define NODE(t, off, type) \
503 ((const struct type *)((const char *)(t) + (off)))
504
505 size_t trie_maxpathlen(const void *t)
506 {
507 const struct trie_header *hdr = NODE(t, 0, trie_header);
508 return hdr->maxpathlen;
509 }
510
511 unsigned long trie_before(const void *t, const char *pathname)
512 {
513 const struct trie_header *hdr = NODE(t, 0, trie_header);
514 int ret = 0, lastcount = hdr->count;
515 int len = 1 + strlen(pathname), depth = 0;
516 off_t off = hdr->root;
517
518 while (1) {
519 const struct trie_common *node = NODE(t, off, trie_common);
520 if (node->type == TRIE_LEAF) {
521 if (depth < len)
522 ret += lastcount; /* _shouldn't_ happen, but in principle */
523 return ret;
524 } else if (node->type == TRIE_STRING) {
525 const struct trie_string *st = NODE(t, off, trie_string);
526
527 int offset;
528 int cmp = triencmp(st->string, st->stringlen,
529 pathname + depth, len-depth, &offset);
530
531 if (offset < st->stringlen) {
532 if (cmp < 0)
533 ret += lastcount;
534 return ret;
535 }
536
537 depth += st->stringlen;
538 off = st->subnode;
539 } else if (node->type == TRIE_SWITCH) {
540 const struct trie_switch *sw = NODE(t, off, trie_switch);
541 const char *chars = (const char *)&sw->sw[sw->len];
542 int i;
543
544 for (i = 0; i < sw->len; i++) {
545 int c = chars[i];
546 int cmp = trieccmp(pathname[depth], c);
547 if (cmp > 0)
548 ret += sw->sw[i].subcount;
549 else if (cmp < 0)
550 return ret;
551 else {
552 off = sw->sw[i].subnode;
553 lastcount = sw->sw[i].subcount;
554 depth++;
555 break;
556 }
557 }
558 if (i == sw->len)
559 return ret;
560 }
561 }
562 }
563
564 void trie_getpath(const void *t, unsigned long n, char *buf)
565 {
566 const struct trie_header *hdr = NODE(t, 0, trie_header);
567 int depth = 0;
568 off_t off = hdr->root;
569
570 while (1) {
571 const struct trie_common *node = NODE(t, off, trie_common);
572 if (node->type == TRIE_LEAF) {
573 assert(depth > 0 && buf[depth-1] == '\0');
574 return;
575 } else if (node->type == TRIE_STRING) {
576 const struct trie_string *st = NODE(t, off, trie_string);
577
578 memcpy(buf + depth, st->string, st->stringlen);
579 depth += st->stringlen;
580 off = st->subnode;
581 } else if (node->type == TRIE_SWITCH) {
582 const struct trie_switch *sw = NODE(t, off, trie_switch);
583 const char *chars = (const char *)&sw->sw[sw->len];
584 int i;
585
586 for (i = 0; i < sw->len; i++) {
587 if (n < sw->sw[i].subcount) {
588 buf[depth++] = chars[i];
589 off = sw->sw[i].subnode;
590 break;
591 } else
592 n -= sw->sw[i].subcount;
593 }
594 assert(i < sw->len);
595 }
596 }
597 }
598
599 const struct trie_file *trie_getfile(const void *t, unsigned long n)
600 {
601 const struct trie_header *hdr = NODE(t, 0, trie_header);
602 off_t off = hdr->root;
603
604 while (1) {
605 const struct trie_common *node = NODE(t, off, trie_common);
606 if (node->type == TRIE_LEAF) {
607 const struct trie_leaf *leaf = NODE(t, off, trie_leaf);
608 return &leaf->file;
609 } else if (node->type == TRIE_STRING) {
610 const struct trie_string *st = NODE(t, off, trie_string);
611 off = st->subnode;
612 } else if (node->type == TRIE_SWITCH) {
613 const struct trie_switch *sw = NODE(t, off, trie_switch);
614 int i;
615
616 for (i = 0; i < sw->len; i++) {
617 if (n < sw->sw[i].subcount) {
618 off = sw->sw[i].subnode;
619 break;
620 } else
621 n -= sw->sw[i].subcount;
622 }
623 assert(i < sw->len);
624 }
625 }
626 }
627
628 unsigned long trie_count(const void *t)
629 {
630 const struct trie_header *hdr = NODE(t, 0, trie_header);
631 return hdr->count;
632 }
633
634 char trie_pathsep(const void *t)
635 {
636 const struct trie_header *hdr = NODE(t, 0, trie_header);
637 return (char)hdr->pathsep;
638 }
639
640 struct triewalk_switch {
641 const struct trie_switch *sw;
642 int pos, depth, count;
643 };
644 struct triewalk {
645 const void *t;
646 struct triewalk_switch *switches;
647 int nswitches, switchsize;
648 int count;
649 };
650 triewalk *triewalk_new(const void *vt)
651 {
652 triewalk *tw = snew(triewalk);
653
654 tw->t = (const char *)vt;
655 tw->switches = NULL;
656 tw->switchsize = 0;
657 tw->nswitches = -1;
658 tw->count = 0;
659
660 return tw;
661 }
662
663 void triewalk_rebase(triewalk *tw, const void *t)
664 {
665 ptrdiff_t diff = ((const unsigned char *)t - (const unsigned char *)(tw->t));
666 int i;
667
668 tw->t = t;
669
670 for (i = 0; i < tw->nswitches; i++)
671 tw->switches[i].sw = (const struct trie_switch *)
672 ((const unsigned char *)(tw->switches[i].sw) + diff);
673 }
674
675 const struct trie_file *triewalk_next(triewalk *tw, char *buf)
676 {
677 off_t off;
678 int depth;
679
680 if (tw->nswitches < 0) {
681 const struct trie_header *hdr = NODE(tw->t, 0, trie_header);
682 off = hdr->root;
683 depth = 0;
684 tw->nswitches = 0;
685 } else {
686 while (1) {
687 int swpos;
688 const struct trie_switch *sw;
689 const char *chars;
690
691 if (tw->nswitches == 0) {
692 assert(tw->count == NODE(tw->t, 0, trie_header)->count);
693 return NULL; /* run out of trie */
694 }
695
696 swpos = tw->switches[tw->nswitches-1].pos;
697 sw = tw->switches[tw->nswitches-1].sw;
698 chars = (const char *)&sw->sw[sw->len];
699
700 if (swpos < sw->len) {
701 depth = tw->switches[tw->nswitches-1].depth;
702 off = sw->sw[swpos].subnode;
703 if (buf)
704 buf[depth++] = chars[swpos];
705 assert(tw->count == tw->switches[tw->nswitches-1].count);
706 tw->switches[tw->nswitches-1].count += sw->sw[swpos].subcount;
707 tw->switches[tw->nswitches-1].pos++;
708 break;
709 }
710
711 tw->nswitches--;
712 }
713 }
714
715 while (1) {
716 const struct trie_common *node = NODE(tw->t, off, trie_common);
717 if (node->type == TRIE_LEAF) {
718 const struct trie_leaf *lf = NODE(tw->t, off, trie_leaf);
719 if (buf)
720 assert(depth > 0 && buf[depth-1] == '\0');
721 tw->count++;
722 return &lf->file;
723 } else if (node->type == TRIE_STRING) {
724 const struct trie_string *st = NODE(tw->t, off, trie_string);
725
726 if (buf)
727 memcpy(buf + depth, st->string, st->stringlen);
728 depth += st->stringlen;
729 off = st->subnode;
730 } else if (node->type == TRIE_SWITCH) {
731 const struct trie_switch *sw = NODE(tw->t, off, trie_switch);
732 const char *chars = (const char *)&sw->sw[sw->len];
733
734 if (tw->nswitches >= tw->switchsize) {
735 tw->switchsize = tw->nswitches * 3 / 2 + 32;
736 tw->switches = sresize(tw->switches, tw->switchsize,
737 struct triewalk_switch);
738 }
739
740 tw->switches[tw->nswitches].sw = sw;
741 tw->switches[tw->nswitches].pos = 1;
742 tw->switches[tw->nswitches].depth = depth;
743 tw->switches[tw->nswitches].count = tw->count + sw->sw[0].subcount;
744 off = sw->sw[0].subnode;
745 if (buf)
746 buf[depth++] = chars[0];
747 tw->nswitches++;
748 }
749 }
750 }
751 void triewalk_free(triewalk *tw)
752 {
753 sfree(tw->switches);
754 sfree(tw);
755 }
756
757 void trie_set_index_offset(void *t, off_t ptr)
758 {
759 ((struct trie_header *)t)->indexroot = ptr;
760 }
761 off_t trie_get_index_offset(const void *t)
762 {
763 return ((const struct trie_header *)t)->indexroot;
764 }
765
766 void make_successor(char *pathbuf)
767 {
768 int len = strlen(pathbuf);
769 if (len > 0 && pathbuf[len-1] == pathsep)
770 len--;
771 pathbuf[len] = '\001';
772 pathbuf[len+1] = '\0';
773 }