d7482997 |
1 | /* |
2 | * index.c: create and collate index data structures |
3 | */ |
4 | |
5 | #include <stdio.h> |
6 | #include <stdlib.h> |
7 | #include "halibut.h" |
8 | |
9 | static int compare_tags(void *av, void *bv); |
10 | static int compare_entries(void *av, void *bv); |
11 | |
12 | indexdata *make_index(void) { |
13 | indexdata *ret = mknew(indexdata); |
14 | ret->tags = newtree234(compare_tags); |
15 | ret->entries = newtree234(compare_entries); |
16 | return ret; |
17 | } |
18 | |
19 | static indextag *make_indextag(void) { |
20 | indextag *ret = mknew(indextag); |
21 | ret->name = NULL; |
22 | ret->implicit_text = NULL; |
23 | ret->explicit_texts = NULL; |
24 | ret->nexplicit = ret->explicit_size = ret->nrefs = 0; |
25 | ret->refs = NULL; |
26 | return ret; |
27 | } |
28 | |
29 | static int compare_tags(void *av, void *bv) { |
30 | indextag *a = (indextag *)av, *b = (indextag *)bv; |
31 | return ustricmp(a->name, b->name); |
32 | } |
33 | |
34 | static int compare_to_find_tag(void *av, void *bv) { |
35 | wchar_t *a = (wchar_t *)av; |
36 | indextag *b = (indextag *)bv; |
37 | return ustricmp(a, b->name); |
38 | } |
39 | |
40 | static int compare_entries(void *av, void *bv) { |
41 | indexentry *a = (indexentry *)av, *b = (indexentry *)bv; |
42 | return compare_wordlists(a->text, b->text); |
43 | } |
44 | |
45 | /* |
46 | * Back-end utility: find the indextag with a given name. |
47 | */ |
48 | indextag *index_findtag(indexdata *idx, wchar_t *name) { |
49 | return find234(idx->tags, name, compare_to_find_tag); |
50 | } |
51 | |
52 | /* |
53 | * Add a \IM. `tags' points to a zero-terminated chain of |
54 | * zero-terminated strings ("first\0second\0thirdandlast\0\0"). |
55 | * `text' points to a word list. |
56 | * |
57 | * Guarantee on calling sequence: all implicit merges are given |
58 | * before the explicit ones. |
59 | */ |
60 | void index_merge(indexdata *idx, int is_explicit, wchar_t *tags, word *text) { |
61 | indextag *t, *existing; |
62 | |
63 | /* |
8856f150 |
64 | * For an implicit merge, we want to remove all emphasis, |
65 | * because the chances are that the user didn't really want to |
66 | * index the term as emphasised. |
67 | */ |
68 | { |
69 | word *w; |
70 | |
71 | for (w = text; w; w = w->next) { |
72 | if (w->type == word_Emph) |
73 | w->type = word_Normal; |
74 | else if (w->type == word_EmphSpace) |
75 | w->type = word_WhiteSpace; |
76 | else if (w->type == word_EmphQuote) |
77 | w->type = word_Quote; |
78 | } |
79 | } |
80 | |
81 | /* |
d7482997 |
82 | * FIXME: want to warn on overlapping source sets. |
83 | */ |
84 | for (; *tags; tags = uadv(tags)) { |
85 | t = make_indextag(); |
86 | t->name = tags; |
87 | existing = add234(idx->tags, t); |
88 | if (existing == t) { |
89 | /* |
90 | * Duplicate this so we can free it independently. |
91 | */ |
92 | t->name = ustrdup(tags); |
93 | |
94 | /* |
95 | * Every tag has an implicit \IM. So if this tag |
96 | * doesn't exist and we're explicit, then we should |
97 | * warn (and drop it, since it won't be referenced). |
98 | */ |
99 | if (is_explicit) { |
100 | error(err_nosuchidxtag, tags); |
101 | continue; |
102 | } |
103 | |
104 | /* |
105 | * Otherwise, this is a new tag with an implicit \IM. |
106 | */ |
107 | t->implicit_text = text; |
108 | } else { |
109 | sfree(t); |
110 | t = existing; |
111 | if (!is_explicit) { |
112 | /* |
113 | * An implicit \IM for a tag that's had an implicit |
114 | * \IM before. FIXME: we should check the text |
115 | * against the existing text and warn on |
116 | * differences. And check the tag for case match |
117 | * against the existing tag, likewise. |
118 | */ |
119 | } else { |
120 | /* |
121 | * An explicit \IM added to a valid tag. In |
122 | * particular, this removes the implicit \IM if |
123 | * present. |
124 | */ |
125 | if (t->implicit_text) { |
126 | free_word_list(t->implicit_text); |
127 | t->implicit_text = NULL; |
128 | } |
129 | if (t->nexplicit >= t->explicit_size) { |
130 | t->explicit_size = t->nexplicit + 8; |
131 | t->explicit_texts = resize(t->explicit_texts, |
132 | t->explicit_size); |
133 | } |
134 | t->explicit_texts[t->nexplicit++] = text; |
135 | } |
136 | } |
137 | } |
138 | } |
139 | |
140 | /* |
141 | * Build the final-form index. We now have every tag, with every |
142 | * \IM, set up in a 2-3 tree indexed by tag. We now want to collate |
143 | * the RHSes of the \IMs, and sort by final form, and decorate the |
144 | * entries in the original 2-3 tree with pointers to the RHS |
145 | * entries. |
146 | */ |
147 | void build_index(indexdata *i) { |
148 | indextag *t; |
149 | word **ta; |
150 | int ti; |
151 | int j; |
152 | |
153 | for (ti = 0; (t = (indextag *)index234(i->tags, ti)) != NULL; ti++) { |
154 | if (t->implicit_text) { |
155 | t->nrefs = 1; |
156 | ta = &t->implicit_text; |
157 | } else { |
158 | t->nrefs = t->nexplicit; |
159 | ta = t->explicit_texts; |
160 | } |
161 | if (t->nrefs) { |
162 | t->refs = mknewa(indexentry *, t->nrefs); |
163 | for (j = 0; j < t->nrefs; j++) { |
164 | indexentry *ent = mknew(indexentry); |
165 | ent->text = *ta++; |
166 | t->refs[j] = add234(i->entries, ent); |
167 | if (t->refs[j] != ent) /* duplicate */ |
168 | sfree(ent); |
169 | } |
170 | } |
171 | } |
172 | } |
173 | |
174 | void cleanup_index(indexdata *i) { |
175 | indextag *t; |
176 | indexentry *ent; |
177 | int ti; |
178 | |
179 | for (ti = 0; (t = (indextag *)index234(i->tags, ti)) != NULL; ti++) { |
180 | sfree(t->name); |
181 | free_word_list(t->implicit_text); |
182 | sfree(t->explicit_texts); |
183 | sfree(t->refs); |
184 | sfree(t); |
185 | } |
186 | freetree234(i->tags); |
187 | for (ti = 0; (ent = (indexentry *)index234(i->entries, ti))!=NULL; ti++) { |
188 | sfree(ent); |
189 | } |
190 | freetree234(i->entries); |
191 | sfree(i); |
192 | } |
193 | |
194 | static void dbg_prtwordlist(int level, word *w); |
195 | static void dbg_prtmerge(int is_explicit, wchar_t *tag, word *text); |
196 | |
197 | void index_debug(indexdata *i) { |
198 | indextag *t; |
199 | indexentry *y; |
200 | int ti; |
201 | int j; |
202 | |
203 | printf("\nINDEX TAGS\n==========\n\n"); |
204 | for (ti = 0; (t = (indextag *)index234(i->tags, ti)) != NULL; ti++) { |
205 | printf("\n"); |
206 | if (t->implicit_text) |
207 | dbg_prtmerge(0, t->name, t->implicit_text); |
208 | for (j = 0; j < t->nexplicit; j++) |
209 | dbg_prtmerge(1, t->name, t->explicit_texts[j]); |
210 | } |
211 | |
212 | printf("\nINDEX ENTRIES\n=============\n\n"); |
213 | for (ti = 0; (y = (indexentry *)index234(i->entries, ti)) != NULL; ti++) { |
214 | printf("\n"); |
215 | printf("{\n"); |
216 | dbg_prtwordlist(1, y->text); |
217 | printf("}\n"); |
218 | } |
219 | } |
220 | |
221 | static void dbg_prtmerge(int is_explicit, wchar_t *tag, word *text) { |
222 | printf("\\IM: %splicit: \"", is_explicit ? "ex" : "im"); |
223 | for (; *tag; tag++) |
224 | putchar(*tag); |
225 | printf("\" {\n"); |
226 | dbg_prtwordlist(1, text); |
227 | printf("}\n"); |
228 | } |
229 | |
230 | static void dbg_prtwordlist(int level, word *w) { |
231 | for (; w; w = w->next) { |
232 | wchar_t *wp; |
233 | printf("%*sword %d ", level*4, "", w->type); |
234 | if (w->text) { |
235 | printf("\""); |
236 | for (wp = w->text; *wp; wp++) |
237 | putchar(*wp); |
238 | printf("\""); |
239 | } else |
240 | printf("(no text)"); |
241 | if (w->alt) { |
242 | printf(" alt = {\n"); |
243 | dbg_prtwordlist(level+1, w->alt); |
244 | printf("%*s}", level*4, ""); |
245 | } |
246 | printf("\n"); |
247 | } |
248 | } |