Add an error check for correct formatting in Deflate uncompressed
[sgt/halibut] / index.c
1 /*
2 * index.c: create and collate index data structures
3 */
4
5 #include <stdio.h>
6 #include <stdlib.h>
7 #include "halibut.h"
8
9 static int compare_tags(void *av, void *bv);
10 static int compare_entries(void *av, void *bv);
11
12 indexdata *make_index(void) {
13 indexdata *ret = snew(indexdata);
14 ret->tags = newtree234(compare_tags);
15 ret->entries = newtree234(compare_entries);
16 return ret;
17 }
18
19 static indextag *make_indextag(void) {
20 indextag *ret = snew(indextag);
21 ret->name = NULL;
22 ret->implicit_text = NULL;
23 ret->explicit_texts = NULL;
24 ret->explicit_fpos = NULL;
25 ret->nexplicit = ret->explicit_size = ret->nrefs = 0;
26 ret->refs = NULL;
27 return ret;
28 }
29
30 static int compare_tags(void *av, void *bv) {
31 indextag *a = (indextag *)av, *b = (indextag *)bv;
32 return ustricmp(a->name, b->name);
33 }
34
35 static int compare_to_find_tag(void *av, void *bv) {
36 wchar_t *a = (wchar_t *)av;
37 indextag *b = (indextag *)bv;
38 return ustricmp(a, b->name);
39 }
40
41 static int compare_entries(void *av, void *bv) {
42 indexentry *a = (indexentry *)av, *b = (indexentry *)bv;
43 return compare_wordlists(a->text, b->text);
44 }
45
46 /*
47 * Back-end utility: find the indextag with a given name.
48 */
49 indextag *index_findtag(indexdata *idx, wchar_t *name) {
50 return find234(idx->tags, name, compare_to_find_tag);
51 }
52
53 /*
54 * Add a \IM. `tags' points to a zero-terminated chain of
55 * zero-terminated strings ("first\0second\0thirdandlast\0\0").
56 * `text' points to a word list.
57 *
58 * Guarantee on calling sequence: all implicit merges are given
59 * before the explicit ones.
60 */
61 void index_merge(indexdata *idx, int is_explicit, wchar_t *tags, word *text,
62 filepos *fpos) {
63 indextag *t, *existing;
64
65 /*
66 * For an implicit merge, we want to remove all emphasis,
67 * because the chances are that the user didn't really want to
68 * index the term as emphasised.
69 */
70 {
71 word *w;
72
73 for (w = text; w; w = w->next) {
74 if (w->type == word_Emph)
75 w->type = word_Normal;
76 else if (w->type == word_EmphSpace)
77 w->type = word_WhiteSpace;
78 else if (w->type == word_EmphQuote)
79 w->type = word_Quote;
80 }
81 }
82
83 /*
84 * FIXME: want to warn on overlapping source sets.
85 */
86 for (; *tags; tags = uadv(tags)) {
87 t = make_indextag();
88 t->name = tags;
89 existing = add234(idx->tags, t);
90 if (existing == t) {
91 /*
92 * Duplicate this so we can free it independently.
93 */
94 t->name = ustrdup(tags);
95
96 /*
97 * Every tag has an implicit \IM. So if this tag
98 * doesn't exist and we're explicit, then we should
99 * warn (and drop it, since it won't be referenced).
100 */
101 if (is_explicit) {
102 error(err_nosuchidxtag, fpos, tags);
103 continue;
104 }
105
106 /*
107 * Otherwise, this is a new tag with an implicit \IM.
108 */
109 t->implicit_text = text;
110 t->implicit_fpos = *fpos;
111 } else {
112 if (!is_explicit) {
113 /*
114 * An implicit \IM for a tag that's had an implicit
115 * \IM before. FIXME: we should check the text
116 * against the existing text and warn on
117 * differences. And check the tag for case match
118 * against the existing tag, likewise.
119 */
120
121 /*
122 * Check the tag against its previous occurrence to
123 * see if the cases match.
124 */
125 if (ustrcmp(t->name, existing->name)) {
126 error(err_indexcase, fpos, t->name,
127 &existing->implicit_fpos, existing->name);
128 }
129
130 sfree(t);
131 } else {
132 /*
133 * An explicit \IM added to a valid tag. In
134 * particular, this removes the implicit \IM if
135 * present.
136 */
137 sfree(t);
138 t = existing;
139 if (t->implicit_text) {
140 free_word_list(t->implicit_text);
141 t->implicit_text = NULL;
142 }
143 if (t->nexplicit >= t->explicit_size) {
144 t->explicit_size = t->nexplicit + 8;
145 t->explicit_texts = sresize(t->explicit_texts,
146 t->explicit_size, word *);
147 t->explicit_fpos = sresize(t->explicit_fpos,
148 t->explicit_size, filepos);
149 }
150 t->explicit_texts[t->nexplicit] = text;
151 t->explicit_fpos[t->nexplicit] = *fpos;
152 t->nexplicit++;
153 }
154 }
155 }
156 }
157
158 /*
159 * Build the final-form index. We now have every tag, with every
160 * \IM, set up in a 2-3 tree indexed by tag. We now want to collate
161 * the RHSes of the \IMs, and sort by final form, and decorate the
162 * entries in the original 2-3 tree with pointers to the RHS
163 * entries.
164 */
165 void build_index(indexdata *i) {
166 indextag *t;
167 word **ta;
168 filepos *fa;
169 int ti;
170 int j;
171
172 for (ti = 0; (t = (indextag *)index234(i->tags, ti)) != NULL; ti++) {
173 if (t->implicit_text) {
174 t->nrefs = 1;
175 ta = &t->implicit_text;
176 fa = &t->implicit_fpos;
177 } else {
178 t->nrefs = t->nexplicit;
179 ta = t->explicit_texts;
180 fa = t->explicit_fpos;
181 }
182 if (t->nrefs) {
183 t->refs = snewn(t->nrefs, indexentry *);
184 for (j = 0; j < t->nrefs; j++) {
185 indexentry *ent = snew(indexentry);
186 ent->text = *ta++;
187 ent->fpos = *fa++;
188 t->refs[j] = add234(i->entries, ent);
189 if (t->refs[j] != ent) /* duplicate */
190 sfree(ent);
191 }
192 }
193 }
194 }
195
196 void cleanup_index(indexdata *i) {
197 indextag *t;
198 indexentry *ent;
199 int ti;
200
201 for (ti = 0; (t = (indextag *)index234(i->tags, ti)) != NULL; ti++) {
202 sfree(t->name);
203 free_word_list(t->implicit_text);
204 sfree(t->explicit_texts);
205 sfree(t->refs);
206 sfree(t);
207 }
208 freetree234(i->tags);
209 for (ti = 0; (ent = (indexentry *)index234(i->entries, ti))!=NULL; ti++) {
210 sfree(ent);
211 }
212 freetree234(i->entries);
213 sfree(i);
214 }
215
216 static void dbg_prtwordlist(int level, word *w);
217 static void dbg_prtmerge(int is_explicit, wchar_t *tag, word *text);
218
219 void index_debug(indexdata *i) {
220 indextag *t;
221 indexentry *y;
222 int ti;
223 int j;
224
225 printf("\nINDEX TAGS\n==========\n\n");
226 for (ti = 0; (t = (indextag *)index234(i->tags, ti)) != NULL; ti++) {
227 printf("\n");
228 if (t->implicit_text)
229 dbg_prtmerge(0, t->name, t->implicit_text);
230 for (j = 0; j < t->nexplicit; j++)
231 dbg_prtmerge(1, t->name, t->explicit_texts[j]);
232 }
233
234 printf("\nINDEX ENTRIES\n=============\n\n");
235 for (ti = 0; (y = (indexentry *)index234(i->entries, ti)) != NULL; ti++) {
236 printf("\n");
237 printf("{\n");
238 dbg_prtwordlist(1, y->text);
239 printf("}\n");
240 }
241 }
242
243 static void dbg_prtmerge(int is_explicit, wchar_t *tag, word *text) {
244 printf("\\IM: %splicit: \"", is_explicit ? "ex" : "im");
245 for (; *tag; tag++)
246 putchar(*tag);
247 printf("\" {\n");
248 dbg_prtwordlist(1, text);
249 printf("}\n");
250 }
251
252 static void dbg_prtwordlist(int level, word *w) {
253 for (; w; w = w->next) {
254 wchar_t *wp;
255 printf("%*sword %d ", level*4, "", w->type);
256 if (w->text) {
257 printf("\"");
258 for (wp = w->text; *wp; wp++)
259 putchar(*wp);
260 printf("\"");
261 } else
262 printf("(no text)");
263 if (w->alt) {
264 printf(" alt = {\n");
265 dbg_prtwordlist(level+1, w->alt);
266 printf("%*s}", level*4, "");
267 }
268 printf("\n");
269 }
270 }