2 * trie.c: implementation of trie.h.
9 #define alignof(typ) ( offsetof(struct { char c; typ t; }, t) )
12 * Compare functions for pathnames. Returns the relative order of
13 * the names, like strcmp; also passes back the offset of the
14 * first differing character if desired.
16 static int trieccmp(unsigned char a
, unsigned char b
)
18 int ia
= (a
== '\0' ?
'\0' : a
== pathsep ?
'\1' : a
+1);
19 int ib
= (b
== '\0' ?
'\0' : b
== pathsep ?
'\1' : b
+1);
23 static int triencmp(const char *a
, size_t alen
,
24 const char *b
, size_t blen
, int *offset
)
27 while (off
< alen
&& off
< blen
&& a
[off
] == b
[off
])
31 if (off
== alen
|| off
== blen
) return (off
== blen
) - (off
== alen
);
32 return trieccmp(a
[off
], b
[off
]);
35 static int triecmp(const char *a
, const char *b
, int *offset
)
37 return triencmp(a
, strlen(a
), b
, strlen(b
), offset
);
40 /* ----------------------------------------------------------------------
41 * Trie node structures.
43 * The trie format stored in the file consists of three distinct
44 * node types, each with a distinguishing type field at the start.
46 * TRIE_LEAF is a leaf node; it contains an actual trie_file
47 * structure, and indicates that when you're searching down the
48 * trie with a string, you should now expect to encounter
51 * TRIE_SWITCH indicates that the set of strings in the trie
52 * include strings with more than one distinct character after the
53 * prefix leading up to this point. Hence, it stores multiple
54 * subnode pointers and a different single character for each one.
56 * TRIE_STRING indicates that at this point everything in the trie
57 * has the same next few characters; it stores a single mandatory
58 * string fragment and exactly one subnode pointer.
61 TRIE_LEAF
= 0x7fffe000,
70 struct trie_switchentry
{
77 struct trie_file file
;
83 * sw[0] to sw[len-1] give the subnode pointers and element
84 * counts. At &sw[len] is stored len single bytes which are
85 * the characters corresponding to each subnode.
88 struct trie_switchentry sw
[];
98 static const char magic_ident_string
[16] = "agedu index file";
101 * 'Magic numbers' to go at the start of an agedu index file.
102 * These are checked (using trie_check_magic) by every agedu mode
103 * which reads a pre-existing index.
105 * As well as identifying an agedu file from any other kind of
106 * file, this magic-number structure is also intended to detect
107 * agedu files which were written on the wrong platform and hence
108 * whose structure layouts are incompatible. To make that as
109 * reliable as possible, I design the structure of magic numbers
110 * as follows: it contains one of each integer type I might use,
111 * each containing a different magic number, and each followed by
112 * a char to indicate where it ends in the file. One byte is set
113 * to the length of the magic-number structure itself, which means
114 * that no two structures of different lengths can possibly
115 * compare equal even if by some chance they match up to the
116 * length of the shorter one. Finally, the whole magic number
117 * structure is memset to another random byte value before
118 * initialising any of these fields, so that padding in between
119 * can be readily identified.
121 char ident
[16]; /* human-readable string */
123 unsigned char magic_len
;
125 unsigned long long longlong_magic
;
126 unsigned char postlonglong_char_magic
;
129 unsigned char postoffset_char_magic
;
132 unsigned char postsize_char_magic
;
135 unsigned char postptr_char_magic
;
137 unsigned long long_magic
;
138 unsigned char postlong_char_magic
;
140 unsigned short short_magic
;
141 unsigned char postshort_char_magic
;
145 struct trie_magic magic
;
146 off_t root
, indexroot
;
152 /* Union only used for computing alignment */
154 struct trie_leaf leaf
;
155 struct { /* fake trie_switch with indeterminate array length filled in */
156 struct trie_common c
;
158 struct trie_switchentry sw
[1];
160 struct { /* fake trie_string with indeterminate array length filled in */
161 struct trie_common c
;
167 #define TRIE_ALIGN alignof(union trie_node)
169 static void setup_magic(struct trie_magic
*th
)
172 * Magic values are chosen so that every byte value used is
173 * distinct (so that we can't fail to spot endianness issues), and
174 * we cast 64 bits of data into each integer type just in case
175 * the platform makes it longer than we expect it to be.
178 memset(th
, 0xCDU
, sizeof(*th
));
180 th
->magic_len
= sizeof(*th
);
182 memcpy(th
->ident
, magic_ident_string
, 16);
184 th
->longlong_magic
= 0x5583F34D5D84F73CULL
;
185 th
->postlonglong_char_magic
= 0xDDU
;
187 th
->offset_magic
= (off_t
)0xB39BF9AD56D48E0BULL
;
188 th
->postoffset_char_magic
= 0x95U
;
190 th
->size_magic
= (size_t)0x6EC752B0EEAEBAC1ULL
;
191 th
->postsize_char_magic
= 0x77U
;
193 th
->null_pointer
= NULL
;
194 th
->postptr_char_magic
= 0x71U
;
196 th
->long_magic
= (unsigned long)0xA81A5E1F44334716ULL
;
197 th
->postlong_char_magic
= 0x99U
;
199 th
->short_magic
= (unsigned short)0x0C8BD7984B68D9FCULL
;
200 th
->postshort_char_magic
= 0x35U
;
203 /* ----------------------------------------------------------------------
204 * Trie-building functions.
218 int lastlen
, lastsize
;
220 struct tbswitch
*switches
;
225 static void tb_seek(triebuild
*tb
, off_t off
)
228 if (lseek(tb
->fd
, off
, SEEK_SET
) < 0) {
229 fprintf(stderr
, PNAME
": lseek: %s\n", strerror(errno
));
234 static void tb_write(triebuild
*tb
, const void *buf
, size_t len
)
238 int ret
= write(tb
->fd
, buf
, len
);
240 fprintf(stderr
, PNAME
": write: %s\n", strerror(errno
));
244 buf
= (const void *)((const char *)buf
+ ret
);
248 static char trie_align_zeroes
[TRIE_ALIGN
];
250 static void tb_align(triebuild
*tb
)
252 int off
= (TRIE_ALIGN
- ((tb
)->offset
% TRIE_ALIGN
)) % TRIE_ALIGN
;
253 tb_write(tb
, trie_align_zeroes
, off
);
256 triebuild
*triebuild_new(int fd
)
258 triebuild
*tb
= snew(triebuild
);
259 struct trie_header th
;
263 tb
->lastlen
= tb
->lastsize
= 0;
269 setup_magic(&th
.magic
);
270 th
.root
= th
.count
= 0;
273 th
.pathsep
= (unsigned char)pathsep
;
276 tb_write(tb
, &th
, sizeof(th
));
281 static off_t
triebuild_unwind(triebuild
*tb
, int targetdepth
, int *outcount
)
286 if (tb
->lastoff
== 0) {
291 offset
= tb
->lastoff
;
293 depth
= tb
->lastlen
+ 1;
295 assert(depth
>= targetdepth
);
297 while (depth
> targetdepth
) {
299 while (depth
> targetdepth
&&
300 (depth
-1 >= tb
->switchsize
|| !tb
->switches
||
301 tb
->switches
[depth
-1].len
== 0))
303 if (odepth
> depth
) {
305 * Write out a string node.
307 size_t nodesize
= sizeof(struct trie_string
) + odepth
- depth
;
308 struct trie_string
*st
= (struct trie_string
*)smalloc(nodesize
);
309 st
->c
.type
= TRIE_STRING
;
310 st
->stringlen
= odepth
- depth
;
311 st
->subnode
= offset
;
312 memcpy(st
->string
, tb
->lastpath
+ depth
, odepth
- depth
);
315 tb_write(tb
, st
, nodesize
);
319 assert(depth
>= targetdepth
);
320 if (depth
<= targetdepth
)
324 * Now we expect to be sitting just below a switch node.
325 * Add our final entry to it and write it out.
329 struct trie_switch
*sw
;
332 int swlen
= tb
->switches
[depth
].len
;
337 tb
->switches
[depth
].c
[swlen
] = tb
->lastpath
[depth
];
338 tb
->switches
[depth
].off
[swlen
] = offset
;
339 tb
->switches
[depth
].count
[swlen
] = count
;
342 nodesize
= sizeof(struct trie_switch
) +
343 swlen
* sizeof(struct trie_switchentry
) + swlen
;
344 sw
= (struct trie_switch
*)smalloc(nodesize
);
345 chars
= (char *)&sw
->sw
[swlen
];
347 sw
->c
.type
= TRIE_SWITCH
;
350 for (i
= 0; i
< swlen
; i
++) {
351 sw
->sw
[i
].subnode
= tb
->switches
[depth
].off
[i
];
352 sw
->sw
[i
].subcount
= tb
->switches
[depth
].count
[i
];
353 chars
[i
] = tb
->switches
[depth
].c
[i
];
355 count
+= tb
->switches
[depth
].count
[i
];
360 tb_write(tb
, sw
, nodesize
);
363 tb
->switches
[depth
].len
= 0; /* clear this node */
371 void triebuild_add(triebuild
*tb
, const char *pathname
,
372 const struct trie_file
*file
)
374 int pathlen
= strlen(pathname
);
377 if (tb
->maxpathlen
< pathlen
+1)
378 tb
->maxpathlen
= pathlen
+1;
385 * Find the first differing character between this pathname
386 * and the previous one.
388 int ret
= triecmp(tb
->lastpath
, pathname
, &depth
);
392 * Finalise all nodes above this depth.
394 offset
= triebuild_unwind(tb
, depth
+1, &count
);
397 * Add the final node we just acquired to the switch node
398 * at our chosen depth, creating it if it isn't already
401 if (tb
->switchsize
<= depth
) {
402 int oldsize
= tb
->switchsize
;
403 tb
->switchsize
= depth
* 3 / 2 + 64;
404 tb
->switches
= sresize(tb
->switches
, tb
->switchsize
,
406 while (oldsize
< tb
->switchsize
)
407 tb
->switches
[oldsize
++].len
= 0;
410 tb
->switches
[depth
].c
[tb
->switches
[depth
].len
] = tb
->lastpath
[depth
];
411 tb
->switches
[depth
].off
[tb
->switches
[depth
].len
] = offset
;
412 tb
->switches
[depth
].count
[tb
->switches
[depth
].len
] = count
;
413 tb
->switches
[depth
].len
++;
417 * Write out a leaf node for the new file, and remember its
421 struct trie_leaf leaf
;
423 leaf
.c
.type
= TRIE_LEAF
;
424 leaf
.file
= *file
; /* structure copy */
427 tb
->lastoff
= tb
->offset
;
428 tb_write(tb
, &leaf
, sizeof(leaf
));
432 * Store this pathname for comparison with the next one.
434 if (tb
->lastsize
< pathlen
+1) {
435 tb
->lastsize
= pathlen
* 3 / 2 + 64;
436 tb
->lastpath
= sresize(tb
->lastpath
, tb
->lastsize
, char);
438 strcpy(tb
->lastpath
, pathname
);
439 tb
->lastlen
= pathlen
;
442 int triebuild_finish(triebuild
*tb
)
444 struct trie_header th
;
446 setup_magic(&th
.magic
);
447 th
.root
= triebuild_unwind(tb
, 0, &th
.count
);
449 th
.maxpathlen
= tb
->maxpathlen
;
450 th
.pathsep
= (unsigned char)pathsep
;
453 tb_write(tb
, &th
, sizeof(th
));
458 void triebuild_free(triebuild
*tb
)
465 /* ----------------------------------------------------------------------
466 * Memory-mapped trie modification.
469 #define MNODE(t, off, type) \
470 ((struct type *)((char *)(t) + (off)))
472 static unsigned long long fake_atime_recurse(void *t
, struct trie_common
*node
,
473 int last_seen_pathsep
)
475 while (node
->type
== TRIE_STRING
) {
476 struct trie_string
*st
= (struct trie_string
*)node
;
477 last_seen_pathsep
= (st
->string
[st
->stringlen
-1] == pathsep
);
478 node
= MNODE(t
, st
->subnode
, trie_common
);
481 if (node
->type
== TRIE_LEAF
) {
482 struct trie_leaf
*leaf
= (struct trie_leaf
*)node
;
483 return leaf
->file
.atime
;
484 } else if (assert(node
->type
== TRIE_SWITCH
), 1) {
485 struct trie_switch
*sw
= (struct trie_switch
*)node
;
486 const char *chars
= (const char *)&sw
->sw
[sw
->len
];
487 unsigned long long max
= 0, subdir
, ret
;
489 int slashindex
= -1, bareindex
= -1;
492 * First, process all the children of this node whose
493 * switch characters are not \0 or pathsep. We do this in
494 * reverse order so as to maintain best cache locality
495 * (tracking generally backwards through the file), though
496 * it doesn't matter semantically.
498 * For each of these children, we're just recursing into
499 * it to do any fixups required below it, and amalgamating
500 * the max atimes we get back.
502 for (i
= sw
->len
; i
-- > 0 ;) {
503 if (chars
[i
] == '\0') {
505 } else if (chars
[i
] == pathsep
) {
508 ret
= fake_atime_recurse(t
, MNODE(t
, sw
->sw
[i
].subnode
,
516 * Now we have at most two child nodes left to deal with:
517 * one with a slash (or general pathsep) and one with \0.
519 * If there's a slash node and a bare node, then the slash
520 * node contains precisely everything inside the directory
521 * described by the bare node; so we must retrieve the max
522 * atime for the slash node and use it to fix up the bare
525 * If there's only a bare node but the pathname leading up
526 * to this point ends in a slash, then _all_ of the child
527 * nodes of this node contain stuff inside the directory
528 * described by the bare node; so we use the whole of the
529 * maximum value we've computed so far to update the bare
532 if (slashindex
>= 0) {
533 ret
= fake_atime_recurse(t
, MNODE(t
, sw
->sw
[slashindex
].subnode
,
539 } else if (last_seen_pathsep
) {
542 /* Don't update the bare subnode at all. */
546 if (bareindex
>= 0) {
547 struct trie_leaf
*leaf
;
549 leaf
= MNODE(t
, sw
->sw
[bareindex
].subnode
, trie_leaf
);
551 if (leaf
&& leaf
->c
.type
== TRIE_LEAF
) {
552 if (leaf
->file
.atime
< subdir
)
553 leaf
->file
.atime
= subdir
;
554 ret
= leaf
->file
.atime
;
556 /* Shouldn't really happen, but be cautious anyway */
557 ret
= fake_atime_recurse(t
, &leaf
->c
, 0);
566 return 0; /* placate lint */
569 void trie_fake_dir_atimes(void *t
)
571 struct trie_header
*hdr
= MNODE(t
, 0, trie_header
);
572 struct trie_common
*node
= MNODE(t
, hdr
->root
, trie_common
);
574 fake_atime_recurse(t
, node
, 1);
577 /* ----------------------------------------------------------------------
578 * Querying functions.
581 #define NODE(t, off, type) \
582 ((const struct type *)((const char *)(t) + (off)))
584 int trie_check_magic(const void *t
)
586 const struct trie_header
*hdr
= NODE(t
, 0, trie_header
);
587 struct trie_magic magic
;
590 return !memcmp(&magic
, &hdr
->magic
, sizeof(magic
));
593 size_t trie_maxpathlen(const void *t
)
595 const struct trie_header
*hdr
= NODE(t
, 0, trie_header
);
596 return hdr
->maxpathlen
;
599 unsigned long trie_before(const void *t
, const char *pathname
)
601 const struct trie_header
*hdr
= NODE(t
, 0, trie_header
);
602 int ret
= 0, lastcount
= hdr
->count
;
603 int len
= 1 + strlen(pathname
), depth
= 0;
604 off_t off
= hdr
->root
;
607 const struct trie_common
*node
= NODE(t
, off
, trie_common
);
608 if (node
->type
== TRIE_LEAF
) {
610 ret
+= lastcount
; /* _shouldn't_ happen, but in principle */
612 } else if (node
->type
== TRIE_STRING
) {
613 const struct trie_string
*st
= NODE(t
, off
, trie_string
);
616 int cmp
= triencmp(st
->string
, st
->stringlen
,
617 pathname
+ depth
, len
-depth
, &offset
);
619 if (offset
< st
->stringlen
) {
625 depth
+= st
->stringlen
;
627 } else if (node
->type
== TRIE_SWITCH
) {
628 const struct trie_switch
*sw
= NODE(t
, off
, trie_switch
);
629 const char *chars
= (const char *)&sw
->sw
[sw
->len
];
632 for (i
= 0; i
< sw
->len
; i
++) {
634 int cmp
= trieccmp(pathname
[depth
], c
);
636 ret
+= sw
->sw
[i
].subcount
;
640 off
= sw
->sw
[i
].subnode
;
641 lastcount
= sw
->sw
[i
].subcount
;
652 void trie_getpath(const void *t
, unsigned long n
, char *buf
)
654 const struct trie_header
*hdr
= NODE(t
, 0, trie_header
);
656 off_t off
= hdr
->root
;
659 const struct trie_common
*node
= NODE(t
, off
, trie_common
);
660 if (node
->type
== TRIE_LEAF
) {
661 assert(depth
> 0 && buf
[depth
-1] == '\0');
663 } else if (node
->type
== TRIE_STRING
) {
664 const struct trie_string
*st
= NODE(t
, off
, trie_string
);
666 memcpy(buf
+ depth
, st
->string
, st
->stringlen
);
667 depth
+= st
->stringlen
;
669 } else if (node
->type
== TRIE_SWITCH
) {
670 const struct trie_switch
*sw
= NODE(t
, off
, trie_switch
);
671 const char *chars
= (const char *)&sw
->sw
[sw
->len
];
674 for (i
= 0; i
< sw
->len
; i
++) {
675 if (n
< sw
->sw
[i
].subcount
) {
676 buf
[depth
++] = chars
[i
];
677 off
= sw
->sw
[i
].subnode
;
680 n
-= sw
->sw
[i
].subcount
;
687 const struct trie_file
*trie_getfile(const void *t
, unsigned long n
)
689 const struct trie_header
*hdr
= NODE(t
, 0, trie_header
);
690 off_t off
= hdr
->root
;
693 const struct trie_common
*node
= NODE(t
, off
, trie_common
);
694 if (node
->type
== TRIE_LEAF
) {
695 const struct trie_leaf
*leaf
= NODE(t
, off
, trie_leaf
);
697 } else if (node
->type
== TRIE_STRING
) {
698 const struct trie_string
*st
= NODE(t
, off
, trie_string
);
700 } else if (node
->type
== TRIE_SWITCH
) {
701 const struct trie_switch
*sw
= NODE(t
, off
, trie_switch
);
704 for (i
= 0; i
< sw
->len
; i
++) {
705 if (n
< sw
->sw
[i
].subcount
) {
706 off
= sw
->sw
[i
].subnode
;
709 n
-= sw
->sw
[i
].subcount
;
716 unsigned long trie_count(const void *t
)
718 const struct trie_header
*hdr
= NODE(t
, 0, trie_header
);
722 char trie_pathsep(const void *t
)
724 const struct trie_header
*hdr
= NODE(t
, 0, trie_header
);
725 return (char)hdr
->pathsep
;
728 struct triewalk_switch
{
729 const struct trie_switch
*sw
;
730 int pos
, depth
, count
;
734 struct triewalk_switch
*switches
;
735 int nswitches
, switchsize
;
738 triewalk
*triewalk_new(const void *vt
)
740 triewalk
*tw
= snew(triewalk
);
742 tw
->t
= (const char *)vt
;
751 void triewalk_rebase(triewalk
*tw
, const void *t
)
753 ptrdiff_t diff
= ((const unsigned char *)t
- (const unsigned char *)(tw
->t
));
758 for (i
= 0; i
< tw
->nswitches
; i
++)
759 tw
->switches
[i
].sw
= (const struct trie_switch
*)
760 ((const unsigned char *)(tw
->switches
[i
].sw
) + diff
);
763 const struct trie_file
*triewalk_next(triewalk
*tw
, char *buf
)
768 if (tw
->nswitches
< 0) {
769 const struct trie_header
*hdr
= NODE(tw
->t
, 0, trie_header
);
776 const struct trie_switch
*sw
;
779 if (tw
->nswitches
== 0) {
780 assert(tw
->count
== NODE(tw
->t
, 0, trie_header
)->count
);
781 return NULL
; /* run out of trie */
784 swpos
= tw
->switches
[tw
->nswitches
-1].pos
;
785 sw
= tw
->switches
[tw
->nswitches
-1].sw
;
786 chars
= (const char *)&sw
->sw
[sw
->len
];
788 if (swpos
< sw
->len
) {
789 depth
= tw
->switches
[tw
->nswitches
-1].depth
;
790 off
= sw
->sw
[swpos
].subnode
;
792 buf
[depth
++] = chars
[swpos
];
793 assert(tw
->count
== tw
->switches
[tw
->nswitches
-1].count
);
794 tw
->switches
[tw
->nswitches
-1].count
+= sw
->sw
[swpos
].subcount
;
795 tw
->switches
[tw
->nswitches
-1].pos
++;
804 const struct trie_common
*node
= NODE(tw
->t
, off
, trie_common
);
805 if (node
->type
== TRIE_LEAF
) {
806 const struct trie_leaf
*lf
= NODE(tw
->t
, off
, trie_leaf
);
808 assert(depth
> 0 && buf
[depth
-1] == '\0');
811 } else if (node
->type
== TRIE_STRING
) {
812 const struct trie_string
*st
= NODE(tw
->t
, off
, trie_string
);
815 memcpy(buf
+ depth
, st
->string
, st
->stringlen
);
816 depth
+= st
->stringlen
;
818 } else if (node
->type
== TRIE_SWITCH
) {
819 const struct trie_switch
*sw
= NODE(tw
->t
, off
, trie_switch
);
820 const char *chars
= (const char *)&sw
->sw
[sw
->len
];
822 if (tw
->nswitches
>= tw
->switchsize
) {
823 tw
->switchsize
= tw
->nswitches
* 3 / 2 + 32;
824 tw
->switches
= sresize(tw
->switches
, tw
->switchsize
,
825 struct triewalk_switch
);
828 tw
->switches
[tw
->nswitches
].sw
= sw
;
829 tw
->switches
[tw
->nswitches
].pos
= 1;
830 tw
->switches
[tw
->nswitches
].depth
= depth
;
831 tw
->switches
[tw
->nswitches
].count
= tw
->count
+ sw
->sw
[0].subcount
;
832 off
= sw
->sw
[0].subnode
;
834 buf
[depth
++] = chars
[0];
839 void triewalk_free(triewalk
*tw
)
845 void trie_set_index_offset(void *t
, off_t ptr
)
847 ((struct trie_header
*)t
)->indexroot
= ptr
;
849 off_t
trie_get_index_offset(const void *t
)
851 return ((const struct trie_header
*)t
)->indexroot
;
854 void make_successor(char *pathbuf
)
856 int len
= strlen(pathbuf
);
857 if (len
> 0 && pathbuf
[len
-1] == pathsep
)
859 pathbuf
[len
] = '\001';
860 pathbuf
[len
+1] = '\0';