Change the index file format to explicitly indicate the appropriate
[sgt/agedu] / trie.c
1 /*
2 * trie.c: implementation of trie.h.
3 */
4
5 #include <assert.h>
6 #include <stdio.h>
7 #include <string.h>
8 #include <stdlib.h>
9 #include <errno.h>
10
11 #include <sys/types.h>
12 #include <unistd.h>
13
14 #include "malloc.h"
15 #include "trie.h"
16
17 #define alignof(typ) ( offsetof(struct { char c; typ t; }, t) )
18
19 extern char pathsep;
20
21 /*
22 * Compare functions for pathnames. Returns the relative order of
23 * the names, like strcmp; also passes back the offset of the
24 * first differing character if desired.
25 */
26 static int trieccmp(unsigned char a, unsigned char b)
27 {
28 a = (a == '\0' ? '\0' : a == pathsep ? '\1' : a+1);
29 b = (b == '\0' ? '\0' : b == pathsep ? '\1' : b+1);
30 return (int)a - (int)b;
31 }
32
33 static int triencmp(const char *a, size_t alen,
34 const char *b, size_t blen, int *offset)
35 {
36 int off = 0;
37 while (off < alen && off < blen && a[off] == b[off])
38 off++;
39 if (offset)
40 *offset = off;
41 if (off == alen || off == blen) return (off == blen) - (off == alen);
42 return trieccmp(a[off], b[off]);
43 }
44
45 static int triecmp(const char *a, const char *b, int *offset)
46 {
47 return triencmp(a, strlen(a), b, strlen(b), offset);
48 }
49
50 /* ----------------------------------------------------------------------
51 * Trie node structures.
52 *
53 * The trie format stored in the file consists of three distinct
54 * node types, each with a distinguishing type field at the start.
55 *
56 * TRIE_LEAF is a leaf node; it contains an actual trie_file
57 * structure, and indicates that when you're searching down the
58 * trie with a string, you should now expect to encounter
59 * end-of-string.
60 *
61 * TRIE_SWITCH indicates that the set of strings in the trie
62 * include strings with more than one distinct character after the
63 * prefix leading up to this point. Hence, it stores multiple
64 * subnode pointers and a different single character for each one.
65 *
66 * TRIE_STRING indicates that at this point everything in the trie
67 * has the same next few characters; it stores a single mandatory
68 * string fragment and exactly one subnode pointer.
69 */
70 enum {
71 TRIE_LEAF = 0x7fffe000,
72 TRIE_SWITCH,
73 TRIE_STRING
74 };
75
76 struct trie_common {
77 int type;
78 };
79
80 struct trie_switchentry {
81 off_t subnode;
82 int subcount;
83 };
84
85 struct trie_leaf {
86 struct trie_common c;
87 struct trie_file file;
88 };
89
90 struct trie_switch {
91 struct trie_common c;
92 /*
93 * sw[0] to sw[len-1] give the subnode pointers and element
94 * counts. At &sw[len] is stored len single bytes which are
95 * the characters corresponding to each subnode.
96 */
97 int len;
98 struct trie_switchentry sw[];
99 };
100
101 struct trie_string {
102 struct trie_common c;
103 int stringlen;
104 off_t subnode;
105 char string[];
106 };
107
108 struct trie_header {
109 unsigned long magic;
110 off_t root, indexroot;
111 int count;
112 size_t maxpathlen;
113 int pathsep;
114 };
115
116 /* Union only used for computing alignment */
117 union trie_node {
118 struct trie_leaf leaf;
119 struct { /* fake trie_switch with indeterminate array length filled in */
120 struct trie_common c;
121 int len;
122 struct trie_switchentry sw[1];
123 } sw;
124 struct { /* fake trie_string with indeterminate array length filled in */
125 struct trie_common c;
126 int stringlen;
127 off_t subnode;
128 char string[1];
129 } str;
130 };
131 #define TRIE_MAGIC 0x75646761UL
132 #define TRIE_ALIGN alignof(union trie_node)
133
134 /* ----------------------------------------------------------------------
135 * Trie-building functions.
136 */
137
138 struct tbswitch {
139 int len;
140 char c[256];
141 off_t off[256];
142 int count[256];
143 };
144
145 struct triebuild {
146 int fd;
147 off_t offset;
148 char *lastpath;
149 int lastlen, lastsize;
150 off_t lastoff;
151 struct tbswitch *switches;
152 int switchsize;
153 size_t maxpathlen;
154 };
155
156 static void tb_seek(triebuild *tb, off_t off)
157 {
158 tb->offset = off;
159 if (lseek(tb->fd, off, SEEK_SET) < 0) {
160 fprintf(stderr, "agedu: lseek: %s\n", strerror(errno));
161 exit(1);
162 }
163 }
164
165 static void tb_write(triebuild *tb, const void *buf, size_t len)
166 {
167 tb->offset += len;
168 while (len > 0) {
169 int ret = write(tb->fd, buf, len);
170 if (ret < 0) {
171 fprintf(stderr, "agedu: write: %s\n", strerror(errno));
172 exit(1);
173 }
174 len -= ret;
175 buf = (const void *)((const char *)buf + ret);
176 }
177 }
178
179 static char trie_align_zeroes[TRIE_ALIGN];
180
181 static void tb_align(triebuild *tb)
182 {
183 int off = (TRIE_ALIGN - ((tb)->offset % TRIE_ALIGN)) % TRIE_ALIGN;
184 tb_write(tb, trie_align_zeroes, off);
185 }
186
187 triebuild *triebuild_new(int fd)
188 {
189 triebuild *tb = snew(triebuild);
190 struct trie_header th;
191
192 tb->fd = fd;
193 tb->lastpath = NULL;
194 tb->lastlen = tb->lastsize = 0;
195 tb->lastoff = 0;
196 tb->switches = NULL;
197 tb->switchsize = 0;
198 tb->maxpathlen = 0;
199
200 th.magic = TRIE_MAGIC;
201 th.root = th.count = 0;
202 th.indexroot = 0;
203 th.maxpathlen = 0;
204 th.pathsep = (unsigned char)pathsep;
205
206 tb_seek(tb, 0);
207 tb_write(tb, &th, sizeof(th));
208
209 return tb;
210 }
211
212 static off_t triebuild_unwind(triebuild *tb, int targetdepth, int *outcount)
213 {
214 off_t offset;
215 int count, depth;
216
217 if (tb->lastoff == 0) {
218 *outcount = 0;
219 return 0;
220 }
221
222 offset = tb->lastoff;
223 count = 1;
224 depth = tb->lastlen + 1;
225
226 assert(depth >= targetdepth);
227
228 while (depth > targetdepth) {
229 int odepth = depth;
230 while (depth > targetdepth &&
231 (depth-1 > tb->switchsize || tb->switches[depth-1].len == 0))
232 depth--;
233 if (odepth > depth) {
234 /*
235 * Write out a string node.
236 */
237 size_t nodesize = sizeof(struct trie_string) + odepth - depth;
238 struct trie_string *st = (struct trie_string *)smalloc(nodesize);
239 st->c.type = TRIE_STRING;
240 st->stringlen = odepth - depth;
241 st->subnode = offset;
242 memcpy(st->string, tb->lastpath + depth, odepth - depth);
243 tb_align(tb);
244 offset = tb->offset;
245 tb_write(tb, st, nodesize);
246 sfree(st);
247 }
248
249 assert(depth >= targetdepth);
250 if (depth <= targetdepth)
251 break;
252
253 /*
254 * Now we expect to be sitting just below a switch node.
255 * Add our final entry to it and write it out.
256 */
257 depth--;
258 {
259 struct trie_switch *sw;
260 char *chars;
261 size_t nodesize;
262 int swlen = tb->switches[depth].len;
263 int i;
264
265 assert(swlen > 0);
266
267 tb->switches[depth].c[swlen] = tb->lastpath[depth];
268 tb->switches[depth].off[swlen] = offset;
269 tb->switches[depth].count[swlen] = count;
270 swlen++;
271
272 nodesize = sizeof(struct trie_switch) +
273 swlen * sizeof(struct trie_switchentry) + swlen;
274 sw = (struct trie_switch *)smalloc(nodesize);
275 chars = (char *)&sw->sw[swlen];
276
277 sw->c.type = TRIE_SWITCH;
278 sw->len = swlen;
279 count = 0;
280 for (i = 0; i < swlen; i++) {
281 sw->sw[i].subnode = tb->switches[depth].off[i];
282 sw->sw[i].subcount = tb->switches[depth].count[i];
283 chars[i] = tb->switches[depth].c[i];
284
285 count += tb->switches[depth].count[i];
286 }
287
288 tb_align(tb);
289 offset = tb->offset;
290 tb_write(tb, sw, nodesize);
291 sfree(sw);
292
293 tb->switches[depth].len = 0; /* clear this node */
294 }
295 }
296
297 *outcount = count;
298 return offset;
299 }
300
301 void triebuild_add(triebuild *tb, const char *pathname,
302 const struct trie_file *file)
303 {
304 int pathlen = strlen(pathname);
305 int depth;
306
307 if (tb->maxpathlen < pathlen+1)
308 tb->maxpathlen = pathlen+1;
309
310 if (tb->lastpath) {
311 off_t offset;
312 int count;
313
314 /*
315 * Find the first differing character between this pathname
316 * and the previous one.
317 */
318 int ret = triecmp(tb->lastpath, pathname, &depth);
319 assert(ret < 0);
320
321 /*
322 * Finalise all nodes above this depth.
323 */
324 offset = triebuild_unwind(tb, depth+1, &count);
325
326 /*
327 * Add the final node we just acquired to the switch node
328 * at our chosen depth, creating it if it isn't already
329 * there.
330 */
331 if (tb->switchsize <= depth) {
332 int oldsize = tb->switchsize;
333 tb->switchsize = depth * 3 / 2 + 64;
334 tb->switches = sresize(tb->switches, tb->switchsize,
335 struct tbswitch);
336 while (oldsize < tb->switchsize)
337 tb->switches[oldsize++].len = 0;
338 }
339
340 tb->switches[depth].c[tb->switches[depth].len] = tb->lastpath[depth];
341 tb->switches[depth].off[tb->switches[depth].len] = offset;
342 tb->switches[depth].count[tb->switches[depth].len] = count;
343 tb->switches[depth].len++;
344 }
345
346 /*
347 * Write out a leaf node for the new file, and remember its
348 * file offset.
349 */
350 {
351 struct trie_leaf leaf;
352
353 leaf.c.type = TRIE_LEAF;
354 leaf.file = *file; /* structure copy */
355
356 tb_align(tb);
357 tb->lastoff = tb->offset;
358 tb_write(tb, &leaf, sizeof(leaf));
359 }
360
361 /*
362 * Store this pathname for comparison with the next one.
363 */
364 if (tb->lastsize < pathlen+1) {
365 tb->lastsize = pathlen * 3 / 2 + 64;
366 tb->lastpath = sresize(tb->lastpath, tb->lastsize, char);
367 }
368 strcpy(tb->lastpath, pathname);
369 tb->lastlen = pathlen;
370 }
371
372 int triebuild_finish(triebuild *tb)
373 {
374 struct trie_header th;
375
376 th.magic = TRIE_MAGIC;
377 th.root = triebuild_unwind(tb, 0, &th.count);
378 th.indexroot = 0;
379 th.maxpathlen = tb->maxpathlen;
380 th.pathsep = (unsigned char)pathsep;
381
382 tb_seek(tb, 0);
383 tb_write(tb, &th, sizeof(th));
384
385 return th.count;
386 }
387
388 void triebuild_free(triebuild *tb)
389 {
390 sfree(tb->switches);
391 sfree(tb->lastpath);
392 sfree(tb);
393 }
394
395 /* ----------------------------------------------------------------------
396 * Querying functions.
397 */
398
399 #define NODE(t, off, type) \
400 ((const struct type *)((const char *)(t) + (off)))
401
402 size_t trie_maxpathlen(const void *t)
403 {
404 const struct trie_header *hdr = NODE(t, 0, trie_header);
405 return hdr->maxpathlen;
406 }
407
408 unsigned long trie_before(const void *t, const char *pathname)
409 {
410 const struct trie_header *hdr = NODE(t, 0, trie_header);
411 int ret = 0, lastcount = hdr->count;
412 int len = 1 + strlen(pathname), depth = 0;
413 off_t off = hdr->root;
414
415 while (1) {
416 const struct trie_common *node = NODE(t, off, trie_common);
417 if (node->type == TRIE_LEAF) {
418 if (depth < len)
419 ret += lastcount; /* _shouldn't_ happen, but in principle */
420 return ret;
421 } else if (node->type == TRIE_STRING) {
422 const struct trie_string *st = NODE(t, off, trie_string);
423
424 int offset;
425 int cmp = triencmp(st->string, st->stringlen,
426 pathname + depth, len-depth, &offset);
427
428 if (offset < st->stringlen) {
429 if (cmp < 0)
430 ret += lastcount;
431 return ret;
432 }
433
434 depth += st->stringlen;
435 off = st->subnode;
436 } else if (node->type == TRIE_SWITCH) {
437 const struct trie_switch *sw = NODE(t, off, trie_switch);
438 const char *chars = (const char *)&sw->sw[sw->len];
439 int i;
440
441 for (i = 0; i < sw->len; i++) {
442 int c = chars[i];
443 int cmp = trieccmp(pathname[depth], c);
444 if (cmp > 0)
445 ret += sw->sw[i].subcount;
446 else if (cmp < 0)
447 return ret;
448 else {
449 off = sw->sw[i].subnode;
450 lastcount = sw->sw[i].subcount;
451 depth++;
452 break;
453 }
454 }
455 if (i == sw->len)
456 return ret;
457 }
458 }
459 }
460
461 void trie_getpath(const void *t, unsigned long n, char *buf)
462 {
463 const struct trie_header *hdr = NODE(t, 0, trie_header);
464 int depth = 0;
465 off_t off = hdr->root;
466
467 while (1) {
468 const struct trie_common *node = NODE(t, off, trie_common);
469 if (node->type == TRIE_LEAF) {
470 assert(depth > 0 && buf[depth-1] == '\0');
471 return;
472 } else if (node->type == TRIE_STRING) {
473 const struct trie_string *st = NODE(t, off, trie_string);
474
475 memcpy(buf + depth, st->string, st->stringlen);
476 depth += st->stringlen;
477 off = st->subnode;
478 } else if (node->type == TRIE_SWITCH) {
479 const struct trie_switch *sw = NODE(t, off, trie_switch);
480 const char *chars = (const char *)&sw->sw[sw->len];
481 int i;
482
483 for (i = 0; i < sw->len; i++) {
484 if (n < sw->sw[i].subcount) {
485 buf[depth++] = chars[i];
486 off = sw->sw[i].subnode;
487 break;
488 } else
489 n -= sw->sw[i].subcount;
490 }
491 assert(i < sw->len);
492 }
493 }
494 }
495
496 unsigned long trie_count(const void *t)
497 {
498 const struct trie_header *hdr = NODE(t, 0, trie_header);
499 return hdr->count;
500 }
501
502 char trie_pathsep(const void *t)
503 {
504 const struct trie_header *hdr = NODE(t, 0, trie_header);
505 return (char)hdr->pathsep;
506 }
507
508 struct triewalk_switch {
509 const struct trie_switch *sw;
510 int pos, depth, count;
511 };
512 struct triewalk {
513 const void *t;
514 struct triewalk_switch *switches;
515 int nswitches, switchsize;
516 int count;
517 };
518 triewalk *triewalk_new(const void *vt)
519 {
520 triewalk *tw = snew(triewalk);
521
522 tw->t = (const char *)vt;
523 tw->switches = NULL;
524 tw->switchsize = 0;
525 tw->nswitches = -1;
526 tw->count = 0;
527
528 return tw;
529 }
530 const struct trie_file *triewalk_next(triewalk *tw, char *buf)
531 {
532 off_t off;
533 int depth;
534
535 if (tw->nswitches < 0) {
536 const struct trie_header *hdr = NODE(tw->t, 0, trie_header);
537 off = hdr->root;
538 depth = 0;
539 tw->nswitches = 0;
540 } else {
541 while (1) {
542 int swpos;
543 const struct trie_switch *sw;
544 const char *chars;
545
546 if (tw->nswitches == 0) {
547 assert(tw->count == NODE(tw->t, 0, trie_header)->count);
548 return NULL; /* run out of trie */
549 }
550
551 swpos = tw->switches[tw->nswitches-1].pos;
552 sw = tw->switches[tw->nswitches-1].sw;
553 chars = (const char *)&sw->sw[sw->len];
554
555 if (swpos < sw->len) {
556 depth = tw->switches[tw->nswitches-1].depth;
557 off = sw->sw[swpos].subnode;
558 if (buf)
559 buf[depth++] = chars[swpos];
560 assert(tw->count == tw->switches[tw->nswitches-1].count);
561 tw->switches[tw->nswitches-1].count += sw->sw[swpos].subcount;
562 tw->switches[tw->nswitches-1].pos++;
563 break;
564 }
565
566 tw->nswitches--;
567 }
568 }
569
570 while (1) {
571 const struct trie_common *node = NODE(tw->t, off, trie_common);
572 if (node->type == TRIE_LEAF) {
573 const struct trie_leaf *lf = NODE(tw->t, off, trie_leaf);
574 if (buf)
575 assert(depth > 0 && buf[depth-1] == '\0');
576 tw->count++;
577 return &lf->file;
578 } else if (node->type == TRIE_STRING) {
579 const struct trie_string *st = NODE(tw->t, off, trie_string);
580
581 if (buf)
582 memcpy(buf + depth, st->string, st->stringlen);
583 depth += st->stringlen;
584 off = st->subnode;
585 } else if (node->type == TRIE_SWITCH) {
586 const struct trie_switch *sw = NODE(tw->t, off, trie_switch);
587 const char *chars = (const char *)&sw->sw[sw->len];
588
589 if (tw->nswitches >= tw->switchsize) {
590 tw->switchsize = tw->nswitches * 3 / 2 + 32;
591 tw->switches = sresize(tw->switches, tw->switchsize,
592 struct triewalk_switch);
593 }
594
595 tw->switches[tw->nswitches].sw = sw;
596 tw->switches[tw->nswitches].pos = 1;
597 tw->switches[tw->nswitches].depth = depth;
598 tw->switches[tw->nswitches].count = tw->count + sw->sw[0].subcount;
599 off = sw->sw[0].subnode;
600 if (buf)
601 buf[depth++] = chars[0];
602 tw->nswitches++;
603 }
604 }
605 }
606 void triewalk_free(triewalk *tw)
607 {
608 sfree(tw->switches);
609 sfree(tw);
610 }
611
612 void trie_set_index_offset(void *t, off_t ptr)
613 {
614 ((struct trie_header *)t)->indexroot = ptr;
615 }
616 off_t trie_get_index_offset(const void *t)
617 {
618 return ((const struct trie_header *)t)->indexroot;
619 }