| 1 | /* |
| 2 | * index.c: create and collate index data structures |
| 3 | */ |
| 4 | |
| 5 | #include <stdio.h> |
| 6 | #include <stdlib.h> |
| 7 | #include "halibut.h" |
| 8 | |
| 9 | static int compare_tags(void *av, void *bv); |
| 10 | static int compare_entries(void *av, void *bv); |
| 11 | |
| 12 | indexdata *make_index(void) { |
| 13 | indexdata *ret = snew(indexdata); |
| 14 | ret->tags = newtree234(compare_tags); |
| 15 | ret->entries = newtree234(compare_entries); |
| 16 | return ret; |
| 17 | } |
| 18 | |
| 19 | static indextag *make_indextag(void) { |
| 20 | indextag *ret = snew(indextag); |
| 21 | ret->name = NULL; |
| 22 | ret->implicit_text = NULL; |
| 23 | ret->explicit_texts = NULL; |
| 24 | ret->explicit_fpos = NULL; |
| 25 | ret->nexplicit = ret->explicit_size = ret->nrefs = 0; |
| 26 | ret->refs = NULL; |
| 27 | return ret; |
| 28 | } |
| 29 | |
| 30 | static int compare_tags(void *av, void *bv) { |
| 31 | indextag *a = (indextag *)av, *b = (indextag *)bv; |
| 32 | return ustricmp(a->name, b->name); |
| 33 | } |
| 34 | |
| 35 | static int compare_to_find_tag(void *av, void *bv) { |
| 36 | wchar_t *a = (wchar_t *)av; |
| 37 | indextag *b = (indextag *)bv; |
| 38 | return ustricmp(a, b->name); |
| 39 | } |
| 40 | |
| 41 | static int compare_entries(void *av, void *bv) { |
| 42 | indexentry *a = (indexentry *)av, *b = (indexentry *)bv; |
| 43 | return compare_wordlists(a->text, b->text); |
| 44 | } |
| 45 | |
| 46 | /* |
| 47 | * Back-end utility: find the indextag with a given name. |
| 48 | */ |
| 49 | indextag *index_findtag(indexdata *idx, wchar_t *name) { |
| 50 | return find234(idx->tags, name, compare_to_find_tag); |
| 51 | } |
| 52 | |
| 53 | /* |
| 54 | * Add a \IM. `tags' points to a zero-terminated chain of |
| 55 | * zero-terminated strings ("first\0second\0thirdandlast\0\0"). |
| 56 | * `text' points to a word list. |
| 57 | * |
| 58 | * Guarantee on calling sequence: all implicit merges are given |
| 59 | * before the explicit ones. |
| 60 | */ |
| 61 | void index_merge(indexdata *idx, int is_explicit, wchar_t *tags, word *text, |
| 62 | filepos *fpos) { |
| 63 | indextag *t, *existing; |
| 64 | |
| 65 | /* |
| 66 | * For an implicit merge, we want to remove all emphasis, |
| 67 | * because the chances are that the user didn't really want to |
| 68 | * index the term as emphasised. |
| 69 | */ |
| 70 | { |
| 71 | word *w; |
| 72 | |
| 73 | for (w = text; w; w = w->next) { |
| 74 | if (w->type == word_Emph) |
| 75 | w->type = word_Normal; |
| 76 | else if (w->type == word_EmphSpace) |
| 77 | w->type = word_WhiteSpace; |
| 78 | else if (w->type == word_EmphQuote) |
| 79 | w->type = word_Quote; |
| 80 | } |
| 81 | } |
| 82 | |
| 83 | /* |
| 84 | * FIXME: want to warn on overlapping source sets. |
| 85 | */ |
| 86 | for (; *tags; tags = uadv(tags)) { |
| 87 | t = make_indextag(); |
| 88 | t->name = tags; |
| 89 | existing = add234(idx->tags, t); |
| 90 | if (existing == t) { |
| 91 | /* |
| 92 | * Duplicate this so we can free it independently. |
| 93 | */ |
| 94 | t->name = ustrdup(tags); |
| 95 | |
| 96 | /* |
| 97 | * Every tag has an implicit \IM. So if this tag |
| 98 | * doesn't exist and we're explicit, then we should |
| 99 | * warn (and drop it, since it won't be referenced). |
| 100 | */ |
| 101 | if (is_explicit) { |
| 102 | error(err_nosuchidxtag, fpos, tags); |
| 103 | continue; |
| 104 | } |
| 105 | |
| 106 | /* |
| 107 | * Otherwise, this is a new tag with an implicit \IM. |
| 108 | */ |
| 109 | t->implicit_text = text; |
| 110 | t->implicit_fpos = *fpos; |
| 111 | } else { |
| 112 | if (!is_explicit) { |
| 113 | /* |
| 114 | * An implicit \IM for a tag that's had an implicit |
| 115 | * \IM before. FIXME: we should check the text |
| 116 | * against the existing text and warn on |
| 117 | * differences. And check the tag for case match |
| 118 | * against the existing tag, likewise. |
| 119 | */ |
| 120 | |
| 121 | /* |
| 122 | * Check the tag against its previous occurrence to |
| 123 | * see if the cases match. |
| 124 | */ |
| 125 | if (ustrcmp(t->name, existing->name)) { |
| 126 | error(err_indexcase, fpos, t->name, |
| 127 | &existing->implicit_fpos, existing->name); |
| 128 | } |
| 129 | |
| 130 | sfree(t); |
| 131 | } else { |
| 132 | /* |
| 133 | * An explicit \IM added to a valid tag. In |
| 134 | * particular, this removes the implicit \IM if |
| 135 | * present. |
| 136 | */ |
| 137 | sfree(t); |
| 138 | t = existing; |
| 139 | if (t->implicit_text) { |
| 140 | free_word_list(t->implicit_text); |
| 141 | t->implicit_text = NULL; |
| 142 | } |
| 143 | if (t->nexplicit >= t->explicit_size) { |
| 144 | t->explicit_size = t->nexplicit + 8; |
| 145 | t->explicit_texts = sresize(t->explicit_texts, |
| 146 | t->explicit_size, word *); |
| 147 | t->explicit_fpos = sresize(t->explicit_fpos, |
| 148 | t->explicit_size, filepos); |
| 149 | } |
| 150 | t->explicit_texts[t->nexplicit] = text; |
| 151 | t->explicit_fpos[t->nexplicit] = *fpos; |
| 152 | t->nexplicit++; |
| 153 | } |
| 154 | } |
| 155 | } |
| 156 | } |
| 157 | |
| 158 | /* |
| 159 | * Build the final-form index. We now have every tag, with every |
| 160 | * \IM, set up in a 2-3 tree indexed by tag. We now want to collate |
| 161 | * the RHSes of the \IMs, and sort by final form, and decorate the |
| 162 | * entries in the original 2-3 tree with pointers to the RHS |
| 163 | * entries. |
| 164 | */ |
| 165 | void build_index(indexdata *i) { |
| 166 | indextag *t; |
| 167 | word **ta; |
| 168 | filepos *fa; |
| 169 | int ti; |
| 170 | int j; |
| 171 | |
| 172 | for (ti = 0; (t = (indextag *)index234(i->tags, ti)) != NULL; ti++) { |
| 173 | if (t->implicit_text) { |
| 174 | t->nrefs = 1; |
| 175 | ta = &t->implicit_text; |
| 176 | fa = &t->implicit_fpos; |
| 177 | } else { |
| 178 | t->nrefs = t->nexplicit; |
| 179 | ta = t->explicit_texts; |
| 180 | fa = t->explicit_fpos; |
| 181 | } |
| 182 | if (t->nrefs) { |
| 183 | t->refs = snewn(t->nrefs, indexentry *); |
| 184 | for (j = 0; j < t->nrefs; j++) { |
| 185 | indexentry *ent = snew(indexentry); |
| 186 | ent->text = *ta++; |
| 187 | ent->fpos = *fa++; |
| 188 | t->refs[j] = add234(i->entries, ent); |
| 189 | if (t->refs[j] != ent) /* duplicate */ |
| 190 | sfree(ent); |
| 191 | } |
| 192 | } |
| 193 | } |
| 194 | } |
| 195 | |
| 196 | void cleanup_index(indexdata *i) { |
| 197 | indextag *t; |
| 198 | indexentry *ent; |
| 199 | int ti; |
| 200 | |
| 201 | for (ti = 0; (t = (indextag *)index234(i->tags, ti)) != NULL; ti++) { |
| 202 | sfree(t->name); |
| 203 | free_word_list(t->implicit_text); |
| 204 | sfree(t->explicit_texts); |
| 205 | sfree(t->refs); |
| 206 | sfree(t); |
| 207 | } |
| 208 | freetree234(i->tags); |
| 209 | for (ti = 0; (ent = (indexentry *)index234(i->entries, ti))!=NULL; ti++) { |
| 210 | sfree(ent); |
| 211 | } |
| 212 | freetree234(i->entries); |
| 213 | sfree(i); |
| 214 | } |
| 215 | |
| 216 | static void dbg_prtwordlist(int level, word *w); |
| 217 | static void dbg_prtmerge(int is_explicit, wchar_t *tag, word *text); |
| 218 | |
| 219 | void index_debug(indexdata *i) { |
| 220 | indextag *t; |
| 221 | indexentry *y; |
| 222 | int ti; |
| 223 | int j; |
| 224 | |
| 225 | printf("\nINDEX TAGS\n==========\n\n"); |
| 226 | for (ti = 0; (t = (indextag *)index234(i->tags, ti)) != NULL; ti++) { |
| 227 | printf("\n"); |
| 228 | if (t->implicit_text) |
| 229 | dbg_prtmerge(0, t->name, t->implicit_text); |
| 230 | for (j = 0; j < t->nexplicit; j++) |
| 231 | dbg_prtmerge(1, t->name, t->explicit_texts[j]); |
| 232 | } |
| 233 | |
| 234 | printf("\nINDEX ENTRIES\n=============\n\n"); |
| 235 | for (ti = 0; (y = (indexentry *)index234(i->entries, ti)) != NULL; ti++) { |
| 236 | printf("\n"); |
| 237 | printf("{\n"); |
| 238 | dbg_prtwordlist(1, y->text); |
| 239 | printf("}\n"); |
| 240 | } |
| 241 | } |
| 242 | |
| 243 | static void dbg_prtmerge(int is_explicit, wchar_t *tag, word *text) { |
| 244 | printf("\\IM: %splicit: \"", is_explicit ? "ex" : "im"); |
| 245 | for (; *tag; tag++) |
| 246 | putchar(*tag); |
| 247 | printf("\" {\n"); |
| 248 | dbg_prtwordlist(1, text); |
| 249 | printf("}\n"); |
| 250 | } |
| 251 | |
| 252 | static void dbg_prtwordlist(int level, word *w) { |
| 253 | for (; w; w = w->next) { |
| 254 | wchar_t *wp; |
| 255 | printf("%*sword %d ", level*4, "", w->type); |
| 256 | if (w->text) { |
| 257 | printf("\""); |
| 258 | for (wp = w->text; *wp; wp++) |
| 259 | putchar(*wp); |
| 260 | printf("\""); |
| 261 | } else |
| 262 | printf("(no text)"); |
| 263 | if (w->alt) { |
| 264 | printf(" alt = {\n"); |
| 265 | dbg_prtwordlist(level+1, w->alt); |
| 266 | printf("%*s}", level*4, ""); |
| 267 | } |
| 268 | printf("\n"); |
| 269 | } |
| 270 | } |