acce8be97579178db0c385c3418d35fd193d67af
2 * trie.c: implementation of trie.h.
11 #include <sys/types.h>
17 #define alignof(typ) ( offsetof(struct { char c; typ t; }, t) )
22 * Compare functions for pathnames. Returns the relative order of
23 * the names, like strcmp; also passes back the offset of the
24 * first differing character if desired.
26 static int trieccmp(unsigned char a
, unsigned char b
)
28 a
= (a
== '\0' ?
'\0' : a
== pathsep ?
'\1' : a
+1);
29 b
= (b
== '\0' ?
'\0' : b
== pathsep ?
'\1' : b
+1);
30 return (int)a
- (int)b
;
33 static int triencmp(const char *a
, size_t alen
,
34 const char *b
, size_t blen
, int *offset
)
37 while (off
< alen
&& off
< blen
&& a
[off
] == b
[off
])
41 if (off
== alen
|| off
== blen
) return (off
== blen
) - (off
== alen
);
42 return trieccmp(a
[off
], b
[off
]);
45 static int triecmp(const char *a
, const char *b
, int *offset
)
47 return triencmp(a
, strlen(a
), b
, strlen(b
), offset
);
50 /* ----------------------------------------------------------------------
51 * Trie node structures.
53 * The trie format stored in the file consists of three distinct
54 * node types, each with a distinguishing type field at the start.
56 * TRIE_LEAF is a leaf node; it contains an actual trie_file
57 * structure, and indicates that when you're searching down the
58 * trie with a string, you should now expect to encounter
61 * TRIE_SWITCH indicates that the set of strings in the trie
62 * include strings with more than one distinct character after the
63 * prefix leading up to this point. Hence, it stores multiple
64 * subnode pointers and a different single character for each one.
66 * TRIE_STRING indicates that at this point everything in the trie
67 * has the same next few characters; it stores a single mandatory
68 * string fragment and exactly one subnode pointer.
71 TRIE_LEAF
= 0x7fffe000,
80 struct trie_switchentry
{
87 struct trie_file file
;
93 * sw[0] to sw[len-1] give the subnode pointers and element
94 * counts. At &sw[len] is stored len single bytes which are
95 * the characters corresponding to each subnode.
98 struct trie_switchentry sw
[];
102 struct trie_common c
;
110 off_t root
, indexroot
;
116 /* Union only used for computing alignment */
118 struct trie_leaf leaf
;
119 struct { /* fake trie_switch with indeterminate array length filled in */
120 struct trie_common c
;
122 struct trie_switchentry sw
[1];
124 struct { /* fake trie_string with indeterminate array length filled in */
125 struct trie_common c
;
131 #define TRIE_MAGIC 0x75646761UL
132 #define TRIE_ALIGN alignof(union trie_node)
134 /* ----------------------------------------------------------------------
135 * Trie-building functions.
149 int lastlen
, lastsize
;
151 struct tbswitch
*switches
;
156 static void tb_seek(triebuild
*tb
, off_t off
)
159 if (lseek(tb
->fd
, off
, SEEK_SET
) < 0) {
160 fprintf(stderr
, "agedu: lseek: %s\n", strerror(errno
));
165 static void tb_write(triebuild
*tb
, const void *buf
, size_t len
)
169 int ret
= write(tb
->fd
, buf
, len
);
171 fprintf(stderr
, "agedu: write: %s\n", strerror(errno
));
175 buf
= (const void *)((const char *)buf
+ ret
);
179 static char trie_align_zeroes
[TRIE_ALIGN
];
181 static void tb_align(triebuild
*tb
)
183 int off
= (TRIE_ALIGN
- ((tb
)->offset
% TRIE_ALIGN
)) % TRIE_ALIGN
;
184 tb_write(tb
, trie_align_zeroes
, off
);
187 triebuild
*triebuild_new(int fd
)
189 triebuild
*tb
= snew(triebuild
);
190 struct trie_header th
;
194 tb
->lastlen
= tb
->lastsize
= 0;
200 th
.magic
= TRIE_MAGIC
;
201 th
.root
= th
.count
= 0;
204 th
.pathsep
= (unsigned char)pathsep
;
207 tb_write(tb
, &th
, sizeof(th
));
212 static off_t
triebuild_unwind(triebuild
*tb
, int targetdepth
, int *outcount
)
217 if (tb
->lastoff
== 0) {
222 offset
= tb
->lastoff
;
224 depth
= tb
->lastlen
+ 1;
226 assert(depth
>= targetdepth
);
228 while (depth
> targetdepth
) {
230 while (depth
> targetdepth
&&
231 (depth
-1 > tb
->switchsize
|| tb
->switches
[depth
-1].len
== 0))
233 if (odepth
> depth
) {
235 * Write out a string node.
237 size_t nodesize
= sizeof(struct trie_string
) + odepth
- depth
;
238 struct trie_string
*st
= (struct trie_string
*)smalloc(nodesize
);
239 st
->c
.type
= TRIE_STRING
;
240 st
->stringlen
= odepth
- depth
;
241 st
->subnode
= offset
;
242 memcpy(st
->string
, tb
->lastpath
+ depth
, odepth
- depth
);
245 tb_write(tb
, st
, nodesize
);
249 assert(depth
>= targetdepth
);
250 if (depth
<= targetdepth
)
254 * Now we expect to be sitting just below a switch node.
255 * Add our final entry to it and write it out.
259 struct trie_switch
*sw
;
262 int swlen
= tb
->switches
[depth
].len
;
267 tb
->switches
[depth
].c
[swlen
] = tb
->lastpath
[depth
];
268 tb
->switches
[depth
].off
[swlen
] = offset
;
269 tb
->switches
[depth
].count
[swlen
] = count
;
272 nodesize
= sizeof(struct trie_switch
) +
273 swlen
* sizeof(struct trie_switchentry
) + swlen
;
274 sw
= (struct trie_switch
*)smalloc(nodesize
);
275 chars
= (char *)&sw
->sw
[swlen
];
277 sw
->c
.type
= TRIE_SWITCH
;
280 for (i
= 0; i
< swlen
; i
++) {
281 sw
->sw
[i
].subnode
= tb
->switches
[depth
].off
[i
];
282 sw
->sw
[i
].subcount
= tb
->switches
[depth
].count
[i
];
283 chars
[i
] = tb
->switches
[depth
].c
[i
];
285 count
+= tb
->switches
[depth
].count
[i
];
290 tb_write(tb
, sw
, nodesize
);
293 tb
->switches
[depth
].len
= 0; /* clear this node */
301 void triebuild_add(triebuild
*tb
, const char *pathname
,
302 const struct trie_file
*file
)
304 int pathlen
= strlen(pathname
);
307 if (tb
->maxpathlen
< pathlen
+1)
308 tb
->maxpathlen
= pathlen
+1;
315 * Find the first differing character between this pathname
316 * and the previous one.
318 int ret
= triecmp(tb
->lastpath
, pathname
, &depth
);
322 * Finalise all nodes above this depth.
324 offset
= triebuild_unwind(tb
, depth
+1, &count
);
327 * Add the final node we just acquired to the switch node
328 * at our chosen depth, creating it if it isn't already
331 if (tb
->switchsize
<= depth
) {
332 int oldsize
= tb
->switchsize
;
333 tb
->switchsize
= depth
* 3 / 2 + 64;
334 tb
->switches
= sresize(tb
->switches
, tb
->switchsize
,
336 while (oldsize
< tb
->switchsize
)
337 tb
->switches
[oldsize
++].len
= 0;
340 tb
->switches
[depth
].c
[tb
->switches
[depth
].len
] = tb
->lastpath
[depth
];
341 tb
->switches
[depth
].off
[tb
->switches
[depth
].len
] = offset
;
342 tb
->switches
[depth
].count
[tb
->switches
[depth
].len
] = count
;
343 tb
->switches
[depth
].len
++;
347 * Write out a leaf node for the new file, and remember its
351 struct trie_leaf leaf
;
353 leaf
.c
.type
= TRIE_LEAF
;
354 leaf
.file
= *file
; /* structure copy */
357 tb
->lastoff
= tb
->offset
;
358 tb_write(tb
, &leaf
, sizeof(leaf
));
362 * Store this pathname for comparison with the next one.
364 if (tb
->lastsize
< pathlen
+1) {
365 tb
->lastsize
= pathlen
* 3 / 2 + 64;
366 tb
->lastpath
= sresize(tb
->lastpath
, tb
->lastsize
, char);
368 strcpy(tb
->lastpath
, pathname
);
369 tb
->lastlen
= pathlen
;
372 int triebuild_finish(triebuild
*tb
)
374 struct trie_header th
;
376 th
.magic
= TRIE_MAGIC
;
377 th
.root
= triebuild_unwind(tb
, 0, &th
.count
);
379 th
.maxpathlen
= tb
->maxpathlen
;
380 th
.pathsep
= (unsigned char)pathsep
;
383 tb_write(tb
, &th
, sizeof(th
));
388 void triebuild_free(triebuild
*tb
)
395 /* ----------------------------------------------------------------------
396 * Querying functions.
399 #define NODE(t, off, type) \
400 ((const struct type *)((const char *)(t) + (off)))
402 size_t trie_maxpathlen(const void *t
)
404 const struct trie_header
*hdr
= NODE(t
, 0, trie_header
);
405 return hdr
->maxpathlen
;
408 unsigned long trie_before(const void *t
, const char *pathname
)
410 const struct trie_header
*hdr
= NODE(t
, 0, trie_header
);
411 int ret
= 0, lastcount
= hdr
->count
;
412 int len
= 1 + strlen(pathname
), depth
= 0;
413 off_t off
= hdr
->root
;
416 const struct trie_common
*node
= NODE(t
, off
, trie_common
);
417 if (node
->type
== TRIE_LEAF
) {
419 ret
+= lastcount
; /* _shouldn't_ happen, but in principle */
421 } else if (node
->type
== TRIE_STRING
) {
422 const struct trie_string
*st
= NODE(t
, off
, trie_string
);
425 int cmp
= triencmp(st
->string
, st
->stringlen
,
426 pathname
+ depth
, len
-depth
, &offset
);
428 if (offset
< st
->stringlen
) {
434 depth
+= st
->stringlen
;
436 } else if (node
->type
== TRIE_SWITCH
) {
437 const struct trie_switch
*sw
= NODE(t
, off
, trie_switch
);
438 const char *chars
= (const char *)&sw
->sw
[sw
->len
];
441 for (i
= 0; i
< sw
->len
; i
++) {
443 int cmp
= trieccmp(pathname
[depth
], c
);
445 ret
+= sw
->sw
[i
].subcount
;
449 off
= sw
->sw
[i
].subnode
;
450 lastcount
= sw
->sw
[i
].subcount
;
461 void trie_getpath(const void *t
, unsigned long n
, char *buf
)
463 const struct trie_header
*hdr
= NODE(t
, 0, trie_header
);
465 off_t off
= hdr
->root
;
468 const struct trie_common
*node
= NODE(t
, off
, trie_common
);
469 if (node
->type
== TRIE_LEAF
) {
470 assert(depth
> 0 && buf
[depth
-1] == '\0');
472 } else if (node
->type
== TRIE_STRING
) {
473 const struct trie_string
*st
= NODE(t
, off
, trie_string
);
475 memcpy(buf
+ depth
, st
->string
, st
->stringlen
);
476 depth
+= st
->stringlen
;
478 } else if (node
->type
== TRIE_SWITCH
) {
479 const struct trie_switch
*sw
= NODE(t
, off
, trie_switch
);
480 const char *chars
= (const char *)&sw
->sw
[sw
->len
];
483 for (i
= 0; i
< sw
->len
; i
++) {
484 if (n
< sw
->sw
[i
].subcount
) {
485 buf
[depth
++] = chars
[i
];
486 off
= sw
->sw
[i
].subnode
;
489 n
-= sw
->sw
[i
].subcount
;
496 unsigned long trie_count(const void *t
)
498 const struct trie_header
*hdr
= NODE(t
, 0, trie_header
);
502 char trie_pathsep(const void *t
)
504 const struct trie_header
*hdr
= NODE(t
, 0, trie_header
);
505 return (char)hdr
->pathsep
;
508 struct triewalk_switch
{
509 const struct trie_switch
*sw
;
510 int pos
, depth
, count
;
514 struct triewalk_switch
*switches
;
515 int nswitches
, switchsize
;
518 triewalk
*triewalk_new(const void *vt
)
520 triewalk
*tw
= snew(triewalk
);
522 tw
->t
= (const char *)vt
;
530 const struct trie_file
*triewalk_next(triewalk
*tw
, char *buf
)
535 if (tw
->nswitches
< 0) {
536 const struct trie_header
*hdr
= NODE(tw
->t
, 0, trie_header
);
543 const struct trie_switch
*sw
;
546 if (tw
->nswitches
== 0) {
547 assert(tw
->count
== NODE(tw
->t
, 0, trie_header
)->count
);
548 return NULL
; /* run out of trie */
551 swpos
= tw
->switches
[tw
->nswitches
-1].pos
;
552 sw
= tw
->switches
[tw
->nswitches
-1].sw
;
553 chars
= (const char *)&sw
->sw
[sw
->len
];
555 if (swpos
< sw
->len
) {
556 depth
= tw
->switches
[tw
->nswitches
-1].depth
;
557 off
= sw
->sw
[swpos
].subnode
;
559 buf
[depth
++] = chars
[swpos
];
560 assert(tw
->count
== tw
->switches
[tw
->nswitches
-1].count
);
561 tw
->switches
[tw
->nswitches
-1].count
+= sw
->sw
[swpos
].subcount
;
562 tw
->switches
[tw
->nswitches
-1].pos
++;
571 const struct trie_common
*node
= NODE(tw
->t
, off
, trie_common
);
572 if (node
->type
== TRIE_LEAF
) {
573 const struct trie_leaf
*lf
= NODE(tw
->t
, off
, trie_leaf
);
575 assert(depth
> 0 && buf
[depth
-1] == '\0');
578 } else if (node
->type
== TRIE_STRING
) {
579 const struct trie_string
*st
= NODE(tw
->t
, off
, trie_string
);
582 memcpy(buf
+ depth
, st
->string
, st
->stringlen
);
583 depth
+= st
->stringlen
;
585 } else if (node
->type
== TRIE_SWITCH
) {
586 const struct trie_switch
*sw
= NODE(tw
->t
, off
, trie_switch
);
587 const char *chars
= (const char *)&sw
->sw
[sw
->len
];
589 if (tw
->nswitches
>= tw
->switchsize
) {
590 tw
->switchsize
= tw
->nswitches
* 3 / 2 + 32;
591 tw
->switches
= sresize(tw
->switches
, tw
->switchsize
,
592 struct triewalk_switch
);
595 tw
->switches
[tw
->nswitches
].sw
= sw
;
596 tw
->switches
[tw
->nswitches
].pos
= 1;
597 tw
->switches
[tw
->nswitches
].depth
= depth
;
598 tw
->switches
[tw
->nswitches
].count
= tw
->count
+ sw
->sw
[0].subcount
;
599 off
= sw
->sw
[0].subnode
;
601 buf
[depth
++] = chars
[0];
606 void triewalk_free(triewalk
*tw
)
612 void trie_set_index_offset(void *t
, off_t ptr
)
614 ((struct trie_header
*)t
)->indexroot
= ptr
;
616 off_t
trie_get_index_offset(const void *t
)
618 return ((const struct trie_header
*)t
)->indexroot
;