I always meant to remove emphasis in the index by default. Now I do
[sgt/halibut] / index.c
1 /*
2 * index.c: create and collate index data structures
3 */
4
5 #include <stdio.h>
6 #include <stdlib.h>
7 #include "halibut.h"
8
9 static int compare_tags(void *av, void *bv);
10 static int compare_entries(void *av, void *bv);
11
12 indexdata *make_index(void) {
13 indexdata *ret = mknew(indexdata);
14 ret->tags = newtree234(compare_tags);
15 ret->entries = newtree234(compare_entries);
16 return ret;
17 }
18
19 static indextag *make_indextag(void) {
20 indextag *ret = mknew(indextag);
21 ret->name = NULL;
22 ret->implicit_text = NULL;
23 ret->explicit_texts = NULL;
24 ret->nexplicit = ret->explicit_size = ret->nrefs = 0;
25 ret->refs = NULL;
26 return ret;
27 }
28
29 static int compare_tags(void *av, void *bv) {
30 indextag *a = (indextag *)av, *b = (indextag *)bv;
31 return ustricmp(a->name, b->name);
32 }
33
34 static int compare_to_find_tag(void *av, void *bv) {
35 wchar_t *a = (wchar_t *)av;
36 indextag *b = (indextag *)bv;
37 return ustricmp(a, b->name);
38 }
39
40 static int compare_entries(void *av, void *bv) {
41 indexentry *a = (indexentry *)av, *b = (indexentry *)bv;
42 return compare_wordlists(a->text, b->text);
43 }
44
45 /*
46 * Back-end utility: find the indextag with a given name.
47 */
48 indextag *index_findtag(indexdata *idx, wchar_t *name) {
49 return find234(idx->tags, name, compare_to_find_tag);
50 }
51
52 /*
53 * Add a \IM. `tags' points to a zero-terminated chain of
54 * zero-terminated strings ("first\0second\0thirdandlast\0\0").
55 * `text' points to a word list.
56 *
57 * Guarantee on calling sequence: all implicit merges are given
58 * before the explicit ones.
59 */
60 void index_merge(indexdata *idx, int is_explicit, wchar_t *tags, word *text) {
61 indextag *t, *existing;
62
63 /*
64 * For an implicit merge, we want to remove all emphasis,
65 * because the chances are that the user didn't really want to
66 * index the term as emphasised.
67 */
68 {
69 word *w;
70
71 for (w = text; w; w = w->next) {
72 if (w->type == word_Emph)
73 w->type = word_Normal;
74 else if (w->type == word_EmphSpace)
75 w->type = word_WhiteSpace;
76 else if (w->type == word_EmphQuote)
77 w->type = word_Quote;
78 }
79 }
80
81 /*
82 * FIXME: want to warn on overlapping source sets.
83 */
84 for (; *tags; tags = uadv(tags)) {
85 t = make_indextag();
86 t->name = tags;
87 existing = add234(idx->tags, t);
88 if (existing == t) {
89 /*
90 * Duplicate this so we can free it independently.
91 */
92 t->name = ustrdup(tags);
93
94 /*
95 * Every tag has an implicit \IM. So if this tag
96 * doesn't exist and we're explicit, then we should
97 * warn (and drop it, since it won't be referenced).
98 */
99 if (is_explicit) {
100 error(err_nosuchidxtag, tags);
101 continue;
102 }
103
104 /*
105 * Otherwise, this is a new tag with an implicit \IM.
106 */
107 t->implicit_text = text;
108 } else {
109 sfree(t);
110 t = existing;
111 if (!is_explicit) {
112 /*
113 * An implicit \IM for a tag that's had an implicit
114 * \IM before. FIXME: we should check the text
115 * against the existing text and warn on
116 * differences. And check the tag for case match
117 * against the existing tag, likewise.
118 */
119 } else {
120 /*
121 * An explicit \IM added to a valid tag. In
122 * particular, this removes the implicit \IM if
123 * present.
124 */
125 if (t->implicit_text) {
126 free_word_list(t->implicit_text);
127 t->implicit_text = NULL;
128 }
129 if (t->nexplicit >= t->explicit_size) {
130 t->explicit_size = t->nexplicit + 8;
131 t->explicit_texts = resize(t->explicit_texts,
132 t->explicit_size);
133 }
134 t->explicit_texts[t->nexplicit++] = text;
135 }
136 }
137 }
138 }
139
140 /*
141 * Build the final-form index. We now have every tag, with every
142 * \IM, set up in a 2-3 tree indexed by tag. We now want to collate
143 * the RHSes of the \IMs, and sort by final form, and decorate the
144 * entries in the original 2-3 tree with pointers to the RHS
145 * entries.
146 */
147 void build_index(indexdata *i) {
148 indextag *t;
149 word **ta;
150 int ti;
151 int j;
152
153 for (ti = 0; (t = (indextag *)index234(i->tags, ti)) != NULL; ti++) {
154 if (t->implicit_text) {
155 t->nrefs = 1;
156 ta = &t->implicit_text;
157 } else {
158 t->nrefs = t->nexplicit;
159 ta = t->explicit_texts;
160 }
161 if (t->nrefs) {
162 t->refs = mknewa(indexentry *, t->nrefs);
163 for (j = 0; j < t->nrefs; j++) {
164 indexentry *ent = mknew(indexentry);
165 ent->text = *ta++;
166 t->refs[j] = add234(i->entries, ent);
167 if (t->refs[j] != ent) /* duplicate */
168 sfree(ent);
169 }
170 }
171 }
172 }
173
174 void cleanup_index(indexdata *i) {
175 indextag *t;
176 indexentry *ent;
177 int ti;
178
179 for (ti = 0; (t = (indextag *)index234(i->tags, ti)) != NULL; ti++) {
180 sfree(t->name);
181 free_word_list(t->implicit_text);
182 sfree(t->explicit_texts);
183 sfree(t->refs);
184 sfree(t);
185 }
186 freetree234(i->tags);
187 for (ti = 0; (ent = (indexentry *)index234(i->entries, ti))!=NULL; ti++) {
188 sfree(ent);
189 }
190 freetree234(i->entries);
191 sfree(i);
192 }
193
194 static void dbg_prtwordlist(int level, word *w);
195 static void dbg_prtmerge(int is_explicit, wchar_t *tag, word *text);
196
197 void index_debug(indexdata *i) {
198 indextag *t;
199 indexentry *y;
200 int ti;
201 int j;
202
203 printf("\nINDEX TAGS\n==========\n\n");
204 for (ti = 0; (t = (indextag *)index234(i->tags, ti)) != NULL; ti++) {
205 printf("\n");
206 if (t->implicit_text)
207 dbg_prtmerge(0, t->name, t->implicit_text);
208 for (j = 0; j < t->nexplicit; j++)
209 dbg_prtmerge(1, t->name, t->explicit_texts[j]);
210 }
211
212 printf("\nINDEX ENTRIES\n=============\n\n");
213 for (ti = 0; (y = (indexentry *)index234(i->entries, ti)) != NULL; ti++) {
214 printf("\n");
215 printf("{\n");
216 dbg_prtwordlist(1, y->text);
217 printf("}\n");
218 }
219 }
220
221 static void dbg_prtmerge(int is_explicit, wchar_t *tag, word *text) {
222 printf("\\IM: %splicit: \"", is_explicit ? "ex" : "im");
223 for (; *tag; tag++)
224 putchar(*tag);
225 printf("\" {\n");
226 dbg_prtwordlist(1, text);
227 printf("}\n");
228 }
229
230 static void dbg_prtwordlist(int level, word *w) {
231 for (; w; w = w->next) {
232 wchar_t *wp;
233 printf("%*sword %d ", level*4, "", w->type);
234 if (w->text) {
235 printf("\"");
236 for (wp = w->text; *wp; wp++)
237 putchar(*wp);
238 printf("\"");
239 } else
240 printf("(no text)");
241 if (w->alt) {
242 printf(" alt = {\n");
243 dbg_prtwordlist(level+1, w->alt);
244 printf("%*s}", level*4, "");
245 }
246 printf("\n");
247 }
248 }