2 * trie.c: implementation of trie.h.
11 #include <sys/types.h>
18 #define alignof(typ) ( offsetof(struct { char c; typ t; }, t) )
21 * Compare functions for pathnames. Returns the relative order of
22 * the names, like strcmp; also passes back the offset of the
23 * first differing character if desired.
25 static int trieccmp(unsigned char a
, unsigned char b
)
27 a
= (a
== '\0' ?
'\0' : a
== pathsep ?
'\1' : a
+1);
28 b
= (b
== '\0' ?
'\0' : b
== pathsep ?
'\1' : b
+1);
29 return (int)a
- (int)b
;
32 static int triencmp(const char *a
, size_t alen
,
33 const char *b
, size_t blen
, int *offset
)
36 while (off
< alen
&& off
< blen
&& a
[off
] == b
[off
])
40 if (off
== alen
|| off
== blen
) return (off
== blen
) - (off
== alen
);
41 return trieccmp(a
[off
], b
[off
]);
44 static int triecmp(const char *a
, const char *b
, int *offset
)
46 return triencmp(a
, strlen(a
), b
, strlen(b
), offset
);
49 /* ----------------------------------------------------------------------
50 * Trie node structures.
52 * The trie format stored in the file consists of three distinct
53 * node types, each with a distinguishing type field at the start.
55 * TRIE_LEAF is a leaf node; it contains an actual trie_file
56 * structure, and indicates that when you're searching down the
57 * trie with a string, you should now expect to encounter
60 * TRIE_SWITCH indicates that the set of strings in the trie
61 * include strings with more than one distinct character after the
62 * prefix leading up to this point. Hence, it stores multiple
63 * subnode pointers and a different single character for each one.
65 * TRIE_STRING indicates that at this point everything in the trie
66 * has the same next few characters; it stores a single mandatory
67 * string fragment and exactly one subnode pointer.
70 TRIE_LEAF
= 0x7fffe000,
79 struct trie_switchentry
{
86 struct trie_file file
;
92 * sw[0] to sw[len-1] give the subnode pointers and element
93 * counts. At &sw[len] is stored len single bytes which are
94 * the characters corresponding to each subnode.
97 struct trie_switchentry sw
[];
101 struct trie_common c
;
109 off_t root
, indexroot
;
115 /* Union only used for computing alignment */
117 struct trie_leaf leaf
;
118 struct { /* fake trie_switch with indeterminate array length filled in */
119 struct trie_common c
;
121 struct trie_switchentry sw
[1];
123 struct { /* fake trie_string with indeterminate array length filled in */
124 struct trie_common c
;
130 #define TRIE_MAGIC 0x75646761UL
131 #define TRIE_ALIGN alignof(union trie_node)
133 /* ----------------------------------------------------------------------
134 * Trie-building functions.
148 int lastlen
, lastsize
;
150 struct tbswitch
*switches
;
155 static void tb_seek(triebuild
*tb
, off_t off
)
158 if (lseek(tb
->fd
, off
, SEEK_SET
) < 0) {
159 fprintf(stderr
, PNAME
": lseek: %s\n", strerror(errno
));
164 static void tb_write(triebuild
*tb
, const void *buf
, size_t len
)
168 int ret
= write(tb
->fd
, buf
, len
);
170 fprintf(stderr
, PNAME
": write: %s\n", strerror(errno
));
174 buf
= (const void *)((const char *)buf
+ ret
);
178 static char trie_align_zeroes
[TRIE_ALIGN
];
180 static void tb_align(triebuild
*tb
)
182 int off
= (TRIE_ALIGN
- ((tb
)->offset
% TRIE_ALIGN
)) % TRIE_ALIGN
;
183 tb_write(tb
, trie_align_zeroes
, off
);
186 triebuild
*triebuild_new(int fd
)
188 triebuild
*tb
= snew(triebuild
);
189 struct trie_header th
;
193 tb
->lastlen
= tb
->lastsize
= 0;
199 th
.magic
= TRIE_MAGIC
;
200 th
.root
= th
.count
= 0;
203 th
.pathsep
= (unsigned char)pathsep
;
206 tb_write(tb
, &th
, sizeof(th
));
211 static off_t
triebuild_unwind(triebuild
*tb
, int targetdepth
, int *outcount
)
216 if (tb
->lastoff
== 0) {
221 offset
= tb
->lastoff
;
223 depth
= tb
->lastlen
+ 1;
225 assert(depth
>= targetdepth
);
227 while (depth
> targetdepth
) {
229 while (depth
> targetdepth
&&
230 (depth
-1 > tb
->switchsize
|| tb
->switches
[depth
-1].len
== 0))
232 if (odepth
> depth
) {
234 * Write out a string node.
236 size_t nodesize
= sizeof(struct trie_string
) + odepth
- depth
;
237 struct trie_string
*st
= (struct trie_string
*)smalloc(nodesize
);
238 st
->c
.type
= TRIE_STRING
;
239 st
->stringlen
= odepth
- depth
;
240 st
->subnode
= offset
;
241 memcpy(st
->string
, tb
->lastpath
+ depth
, odepth
- depth
);
244 tb_write(tb
, st
, nodesize
);
248 assert(depth
>= targetdepth
);
249 if (depth
<= targetdepth
)
253 * Now we expect to be sitting just below a switch node.
254 * Add our final entry to it and write it out.
258 struct trie_switch
*sw
;
261 int swlen
= tb
->switches
[depth
].len
;
266 tb
->switches
[depth
].c
[swlen
] = tb
->lastpath
[depth
];
267 tb
->switches
[depth
].off
[swlen
] = offset
;
268 tb
->switches
[depth
].count
[swlen
] = count
;
271 nodesize
= sizeof(struct trie_switch
) +
272 swlen
* sizeof(struct trie_switchentry
) + swlen
;
273 sw
= (struct trie_switch
*)smalloc(nodesize
);
274 chars
= (char *)&sw
->sw
[swlen
];
276 sw
->c
.type
= TRIE_SWITCH
;
279 for (i
= 0; i
< swlen
; i
++) {
280 sw
->sw
[i
].subnode
= tb
->switches
[depth
].off
[i
];
281 sw
->sw
[i
].subcount
= tb
->switches
[depth
].count
[i
];
282 chars
[i
] = tb
->switches
[depth
].c
[i
];
284 count
+= tb
->switches
[depth
].count
[i
];
289 tb_write(tb
, sw
, nodesize
);
292 tb
->switches
[depth
].len
= 0; /* clear this node */
300 void triebuild_add(triebuild
*tb
, const char *pathname
,
301 const struct trie_file
*file
)
303 int pathlen
= strlen(pathname
);
306 if (tb
->maxpathlen
< pathlen
+1)
307 tb
->maxpathlen
= pathlen
+1;
314 * Find the first differing character between this pathname
315 * and the previous one.
317 int ret
= triecmp(tb
->lastpath
, pathname
, &depth
);
321 * Finalise all nodes above this depth.
323 offset
= triebuild_unwind(tb
, depth
+1, &count
);
326 * Add the final node we just acquired to the switch node
327 * at our chosen depth, creating it if it isn't already
330 if (tb
->switchsize
<= depth
) {
331 int oldsize
= tb
->switchsize
;
332 tb
->switchsize
= depth
* 3 / 2 + 64;
333 tb
->switches
= sresize(tb
->switches
, tb
->switchsize
,
335 while (oldsize
< tb
->switchsize
)
336 tb
->switches
[oldsize
++].len
= 0;
339 tb
->switches
[depth
].c
[tb
->switches
[depth
].len
] = tb
->lastpath
[depth
];
340 tb
->switches
[depth
].off
[tb
->switches
[depth
].len
] = offset
;
341 tb
->switches
[depth
].count
[tb
->switches
[depth
].len
] = count
;
342 tb
->switches
[depth
].len
++;
346 * Write out a leaf node for the new file, and remember its
350 struct trie_leaf leaf
;
352 leaf
.c
.type
= TRIE_LEAF
;
353 leaf
.file
= *file
; /* structure copy */
356 tb
->lastoff
= tb
->offset
;
357 tb_write(tb
, &leaf
, sizeof(leaf
));
361 * Store this pathname for comparison with the next one.
363 if (tb
->lastsize
< pathlen
+1) {
364 tb
->lastsize
= pathlen
* 3 / 2 + 64;
365 tb
->lastpath
= sresize(tb
->lastpath
, tb
->lastsize
, char);
367 strcpy(tb
->lastpath
, pathname
);
368 tb
->lastlen
= pathlen
;
371 int triebuild_finish(triebuild
*tb
)
373 struct trie_header th
;
375 th
.magic
= TRIE_MAGIC
;
376 th
.root
= triebuild_unwind(tb
, 0, &th
.count
);
378 th
.maxpathlen
= tb
->maxpathlen
;
379 th
.pathsep
= (unsigned char)pathsep
;
382 tb_write(tb
, &th
, sizeof(th
));
387 void triebuild_free(triebuild
*tb
)
394 /* ----------------------------------------------------------------------
395 * Memory-mapped trie modification.
398 #define MNODE(t, off, type) \
399 ((struct type *)((char *)(t) + (off)))
401 static unsigned long long fake_atime_recurse(void *t
, struct trie_common
*node
,
402 int last_seen_pathsep
)
404 while (node
->type
== TRIE_STRING
) {
405 struct trie_string
*st
= (struct trie_string
*)node
;
406 last_seen_pathsep
= (st
->string
[st
->stringlen
-1] == pathsep
);
407 node
= MNODE(t
, st
->subnode
, trie_common
);
410 if (node
->type
== TRIE_LEAF
) {
411 struct trie_leaf
*leaf
= (struct trie_leaf
*)node
;
412 return leaf
->file
.atime
;
413 } else if (assert(node
->type
== TRIE_SWITCH
), 1) {
414 struct trie_switch
*sw
= (struct trie_switch
*)node
;
415 const char *chars
= (const char *)&sw
->sw
[sw
->len
];
416 unsigned long long max
= 0, subdir
, ret
;
418 int slashindex
= -1, bareindex
= -1;
421 * First, process all the children of this node whose
422 * switch characters are not \0 or pathsep. We do this in
423 * reverse order so as to maintain best cache locality
424 * (tracking generally backwards through the file), though
425 * it doesn't matter semantically.
427 * For each of these children, we're just recursing into
428 * it to do any fixups required below it, and amalgamating
429 * the max atimes we get back.
431 for (i
= sw
->len
; i
-- > 0 ;) {
432 if (chars
[i
] == '\0') {
434 } else if (chars
[i
] == pathsep
) {
437 ret
= fake_atime_recurse(t
, MNODE(t
, sw
->sw
[i
].subnode
,
445 * Now we have at most two child nodes left to deal with:
446 * one with a slash (or general pathsep) and one with \0.
448 * If there's a slash node and a bare node, then the slash
449 * node contains precisely everything inside the directory
450 * described by the bare node; so we must retrieve the max
451 * atime for the slash node and use it to fix up the bare
454 * If there's only a bare node but the pathname leading up
455 * to this point ends in a slash, then _all_ of the child
456 * nodes of this node contain stuff inside the directory
457 * described by the bare node; so we use the whole of the
458 * maximum value we've computed so far to update the bare
461 if (slashindex
>= 0) {
462 ret
= fake_atime_recurse(t
, MNODE(t
, sw
->sw
[slashindex
].subnode
,
468 } else if (last_seen_pathsep
) {
471 /* Don't update the bare subnode at all. */
475 if (bareindex
>= 0) {
476 struct trie_leaf
*leaf
;
478 leaf
= MNODE(t
, sw
->sw
[bareindex
].subnode
, trie_leaf
);
480 if (leaf
&& leaf
->c
.type
== TRIE_LEAF
) {
481 if (leaf
->file
.atime
< subdir
)
482 leaf
->file
.atime
= subdir
;
483 ret
= leaf
->file
.atime
;
485 /* Shouldn't really happen, but be cautious anyway */
486 ret
= fake_atime_recurse(t
, &leaf
->c
, 0);
497 void trie_fake_dir_atimes(void *t
)
499 struct trie_header
*hdr
= MNODE(t
, 0, trie_header
);
500 struct trie_common
*node
= MNODE(t
, hdr
->root
, trie_common
);
502 fake_atime_recurse(t
, node
, 1);
505 /* ----------------------------------------------------------------------
506 * Querying functions.
509 #define NODE(t, off, type) \
510 ((const struct type *)((const char *)(t) + (off)))
512 size_t trie_maxpathlen(const void *t
)
514 const struct trie_header
*hdr
= NODE(t
, 0, trie_header
);
515 return hdr
->maxpathlen
;
518 unsigned long trie_before(const void *t
, const char *pathname
)
520 const struct trie_header
*hdr
= NODE(t
, 0, trie_header
);
521 int ret
= 0, lastcount
= hdr
->count
;
522 int len
= 1 + strlen(pathname
), depth
= 0;
523 off_t off
= hdr
->root
;
526 const struct trie_common
*node
= NODE(t
, off
, trie_common
);
527 if (node
->type
== TRIE_LEAF
) {
529 ret
+= lastcount
; /* _shouldn't_ happen, but in principle */
531 } else if (node
->type
== TRIE_STRING
) {
532 const struct trie_string
*st
= NODE(t
, off
, trie_string
);
535 int cmp
= triencmp(st
->string
, st
->stringlen
,
536 pathname
+ depth
, len
-depth
, &offset
);
538 if (offset
< st
->stringlen
) {
544 depth
+= st
->stringlen
;
546 } else if (node
->type
== TRIE_SWITCH
) {
547 const struct trie_switch
*sw
= NODE(t
, off
, trie_switch
);
548 const char *chars
= (const char *)&sw
->sw
[sw
->len
];
551 for (i
= 0; i
< sw
->len
; i
++) {
553 int cmp
= trieccmp(pathname
[depth
], c
);
555 ret
+= sw
->sw
[i
].subcount
;
559 off
= sw
->sw
[i
].subnode
;
560 lastcount
= sw
->sw
[i
].subcount
;
571 void trie_getpath(const void *t
, unsigned long n
, char *buf
)
573 const struct trie_header
*hdr
= NODE(t
, 0, trie_header
);
575 off_t off
= hdr
->root
;
578 const struct trie_common
*node
= NODE(t
, off
, trie_common
);
579 if (node
->type
== TRIE_LEAF
) {
580 assert(depth
> 0 && buf
[depth
-1] == '\0');
582 } else if (node
->type
== TRIE_STRING
) {
583 const struct trie_string
*st
= NODE(t
, off
, trie_string
);
585 memcpy(buf
+ depth
, st
->string
, st
->stringlen
);
586 depth
+= st
->stringlen
;
588 } else if (node
->type
== TRIE_SWITCH
) {
589 const struct trie_switch
*sw
= NODE(t
, off
, trie_switch
);
590 const char *chars
= (const char *)&sw
->sw
[sw
->len
];
593 for (i
= 0; i
< sw
->len
; i
++) {
594 if (n
< sw
->sw
[i
].subcount
) {
595 buf
[depth
++] = chars
[i
];
596 off
= sw
->sw
[i
].subnode
;
599 n
-= sw
->sw
[i
].subcount
;
606 unsigned long trie_count(const void *t
)
608 const struct trie_header
*hdr
= NODE(t
, 0, trie_header
);
612 char trie_pathsep(const void *t
)
614 const struct trie_header
*hdr
= NODE(t
, 0, trie_header
);
615 return (char)hdr
->pathsep
;
618 struct triewalk_switch
{
619 const struct trie_switch
*sw
;
620 int pos
, depth
, count
;
624 struct triewalk_switch
*switches
;
625 int nswitches
, switchsize
;
628 triewalk
*triewalk_new(const void *vt
)
630 triewalk
*tw
= snew(triewalk
);
632 tw
->t
= (const char *)vt
;
640 const struct trie_file
*triewalk_next(triewalk
*tw
, char *buf
)
645 if (tw
->nswitches
< 0) {
646 const struct trie_header
*hdr
= NODE(tw
->t
, 0, trie_header
);
653 const struct trie_switch
*sw
;
656 if (tw
->nswitches
== 0) {
657 assert(tw
->count
== NODE(tw
->t
, 0, trie_header
)->count
);
658 return NULL
; /* run out of trie */
661 swpos
= tw
->switches
[tw
->nswitches
-1].pos
;
662 sw
= tw
->switches
[tw
->nswitches
-1].sw
;
663 chars
= (const char *)&sw
->sw
[sw
->len
];
665 if (swpos
< sw
->len
) {
666 depth
= tw
->switches
[tw
->nswitches
-1].depth
;
667 off
= sw
->sw
[swpos
].subnode
;
669 buf
[depth
++] = chars
[swpos
];
670 assert(tw
->count
== tw
->switches
[tw
->nswitches
-1].count
);
671 tw
->switches
[tw
->nswitches
-1].count
+= sw
->sw
[swpos
].subcount
;
672 tw
->switches
[tw
->nswitches
-1].pos
++;
681 const struct trie_common
*node
= NODE(tw
->t
, off
, trie_common
);
682 if (node
->type
== TRIE_LEAF
) {
683 const struct trie_leaf
*lf
= NODE(tw
->t
, off
, trie_leaf
);
685 assert(depth
> 0 && buf
[depth
-1] == '\0');
688 } else if (node
->type
== TRIE_STRING
) {
689 const struct trie_string
*st
= NODE(tw
->t
, off
, trie_string
);
692 memcpy(buf
+ depth
, st
->string
, st
->stringlen
);
693 depth
+= st
->stringlen
;
695 } else if (node
->type
== TRIE_SWITCH
) {
696 const struct trie_switch
*sw
= NODE(tw
->t
, off
, trie_switch
);
697 const char *chars
= (const char *)&sw
->sw
[sw
->len
];
699 if (tw
->nswitches
>= tw
->switchsize
) {
700 tw
->switchsize
= tw
->nswitches
* 3 / 2 + 32;
701 tw
->switches
= sresize(tw
->switches
, tw
->switchsize
,
702 struct triewalk_switch
);
705 tw
->switches
[tw
->nswitches
].sw
= sw
;
706 tw
->switches
[tw
->nswitches
].pos
= 1;
707 tw
->switches
[tw
->nswitches
].depth
= depth
;
708 tw
->switches
[tw
->nswitches
].count
= tw
->count
+ sw
->sw
[0].subcount
;
709 off
= sw
->sw
[0].subnode
;
711 buf
[depth
++] = chars
[0];
716 void triewalk_free(triewalk
*tw
)
722 void trie_set_index_offset(void *t
, off_t ptr
)
724 ((struct trie_header
*)t
)->indexroot
= ptr
;
726 off_t
trie_get_index_offset(const void *t
)
728 return ((const struct trie_header
*)t
)->indexroot
;
731 void make_successor(char *pathbuf
)
733 int len
= strlen(pathbuf
);
734 if (len
> 0 && pathbuf
[len
-1] == pathsep
)
736 pathbuf
[len
] = '\001';
737 pathbuf
[len
+1] = '\0';