mdw@git.distorted.org.uk Git - sgt/halibut/blob - index.c

   1 /*
   2  * index.c: create and collate index data structures
   3  */
   4
   5 #include <stdio.h>
   6 #include <stdlib.h>
   7 #include "halibut.h"
   8
   9 static int compare_tags(void *av, void *bv);
  10 static int compare_entries(void *av, void *bv);
  11
  12 indexdata *make_index(void) {
  13     indexdata *ret = snew(indexdata);
  14     ret->tags = newtree234(compare_tags);
  15     ret->entries = newtree234(compare_entries);
  16     return ret;
  17 }
  18
  19 static indextag *make_indextag(void) {
  20     indextag *ret = snew(indextag);
  21     ret->name = NULL;
  22     ret->implicit_text = NULL;
  23     ret->explicit_texts = NULL;
  24     ret->explicit_fpos = NULL;
  25     ret->nexplicit = ret->explicit_size = ret->nrefs = 0;
  26     ret->refs = NULL;
  27     return ret;
  28 }
  29
  30 static int compare_tags(void *av, void *bv) {
  31     indextag *a = (indextag *)av, *b = (indextag *)bv;
  32     return ustricmp(a->name, b->name);
  33 }
  34
  35 static int compare_to_find_tag(void *av, void *bv) {
  36     wchar_t *a = (wchar_t *)av;
  37     indextag *b = (indextag *)bv;
  38     return ustricmp(a, b->name);
  39 }
  40
  41 static int compare_entries(void *av, void *bv) {
  42     indexentry *a = (indexentry *)av, *b = (indexentry *)bv;
  43     return compare_wordlists(a->text, b->text);
  44 }
  45
  46 /*
  47  * Back-end utility: find the indextag with a given name.
  48  */
  49 indextag *index_findtag(indexdata *idx, wchar_t *name) {
  50     return find234(idx->tags, name, compare_to_find_tag);
  51 }
  52
  53 /*
  54  * Add a \IM. `tags' points to a zero-terminated chain of
  55  * zero-terminated strings ("first\0second\0thirdandlast\0\0").
  56  * `text' points to a word list.
  57  *
  58  * Guarantee on calling sequence: all implicit merges are given
  59  * before the explicit ones.
  60  */
  61 void index_merge(indexdata *idx, int is_explicit, wchar_t *tags, word *text,
  62                  filepos *fpos) {
  63     indextag *t, *existing;
  64
  65     /*
  66      * For an implicit merge, we want to remove all emphasis,
  67      * because the chances are that the user didn't really want to
  68      * index the term as emphasised.
  69      */
  70     {
  71         word *w;
  72
  73         for (w = text; w; w = w->next) {
  74             if (w->type == word_Emph)
  75                 w->type = word_Normal;
  76             else if (w->type == word_EmphSpace)
  77                 w->type = word_WhiteSpace;
  78             else if (w->type == word_EmphQuote)
  79                 w->type = word_Quote;
  80         }
  81     }
  82
  83     /*
  84      * FIXME: want to warn on overlapping source sets.
  85      */
  86     for (; *tags; tags = uadv(tags)) {
  87         t = make_indextag();
  88         t->name = tags;
  89         existing = add234(idx->tags, t);
  90         if (existing == t) {
  91             /*
  92              * Duplicate this so we can free it independently.
  93              */
  94             t->name = ustrdup(tags);
  95
  96             /*
  97              * Every tag has an implicit \IM. So if this tag
  98              * doesn't exist and we're explicit, then we should
  99              * warn (and drop it, since it won't be referenced).
 100              */
 101             if (is_explicit) {
 102                 error(err_nosuchidxtag, fpos, tags);
 103                 continue;
 104             }
 105
 106             /*
 107              * Otherwise, this is a new tag with an implicit \IM.
 108              */
 109             t->implicit_text = text;
 110             t->implicit_fpos = *fpos;
 111         } else {
 112             if (!is_explicit) {
 113                 /*
 114                  * An implicit \IM for a tag that's had an implicit
 115                  * \IM before. FIXME: we should check the text
 116                  * against the existing text and warn on
 117                  * differences. And check the tag for case match
 118                  * against the existing tag, likewise.
 119                  */
 120
 121                 /*
 122                  * Check the tag against its previous occurrence to
 123                  * see if the cases match.
 124                  */
 125                 if (ustrcmp(t->name, existing->name)) {
 126                     error(err_indexcase, fpos, t->name,
 127                           &existing->implicit_fpos, existing->name);
 128                 }
 129
 130                 sfree(t);
 131             } else {
 132                 /*
 133                  * An explicit \IM added to a valid tag. In
 134                  * particular, this removes the implicit \IM if
 135                  * present.
 136                  */
 137                 sfree(t);
 138                 t = existing;
 139                 if (t->implicit_text) {
 140                     free_word_list(t->implicit_text);
 141                     t->implicit_text = NULL;
 142                 }
 143                 if (t->nexplicit >= t->explicit_size) {
 144                     t->explicit_size = t->nexplicit + 8;
 145                     t->explicit_texts = sresize(t->explicit_texts,
 146                                                 t->explicit_size, word *);
 147                     t->explicit_fpos = sresize(t->explicit_fpos,
 148                                                t->explicit_size, filepos);
 149                 }
 150                 t->explicit_texts[t->nexplicit] = text;
 151                 t->explicit_fpos[t->nexplicit] = *fpos;
 152                 t->nexplicit++;
 153             }
 154         }
 155     }
 156 }
 157
 158 /*
 159  * Build the final-form index. We now have every tag, with every
 160  * \IM, set up in a 2-3 tree indexed by tag. We now want to collate
 161  * the RHSes of the \IMs, and sort by final form, and decorate the
 162  * entries in the original 2-3 tree with pointers to the RHS
 163  * entries.
 164  */
 165 void build_index(indexdata *i) {
 166     indextag *t;
 167     word **ta;
 168     filepos *fa;
 169     int ti;
 170     int j;
 171
 172     for (ti = 0; (t = (indextag *)index234(i->tags, ti)) != NULL; ti++) {
 173         if (t->implicit_text) {
 174             t->nrefs = 1;
 175             ta = &t->implicit_text;
 176             fa = &t->implicit_fpos;
 177         } else {
 178             t->nrefs = t->nexplicit;
 179             ta = t->explicit_texts;
 180             fa = t->explicit_fpos;
 181         }
 182         if (t->nrefs) {
 183             t->refs = snewn(t->nrefs, indexentry *);
 184             for (j = 0; j < t->nrefs; j++) {
 185                 indexentry *ent = snew(indexentry);
 186                 ent->text = *ta++;
 187                 ent->fpos = *fa++;
 188                 t->refs[j] = add234(i->entries, ent);
 189                 if (t->refs[j] != ent)     /* duplicate */
 190                     sfree(ent);
 191             }
 192         }
 193     }
 194 }
 195
 196 void cleanup_index(indexdata *i) {
 197     indextag *t;
 198     indexentry *ent;
 199     int ti;
 200
 201     for (ti = 0; (t = (indextag *)index234(i->tags, ti)) != NULL; ti++) {
 202         sfree(t->name);
 203         free_word_list(t->implicit_text);
 204         sfree(t->explicit_texts);
 205         sfree(t->refs);
 206         sfree(t);
 207     }
 208     freetree234(i->tags);
 209     for (ti = 0; (ent = (indexentry *)index234(i->entries, ti))!=NULL; ti++) {
 210         sfree(ent);
 211     }
 212     freetree234(i->entries);
 213     sfree(i);
 214 }
 215
 216 static void dbg_prtwordlist(int level, word *w);
 217 static void dbg_prtmerge(int is_explicit, wchar_t *tag, word *text);
 218
 219 void index_debug(indexdata *i) {
 220     indextag *t;
 221     indexentry *y;
 222     int ti;
 223     int j;
 224
 225     printf("\nINDEX TAGS\n==========\n\n");
 226     for (ti = 0; (t = (indextag *)index234(i->tags, ti)) != NULL; ti++) {
 227         printf("\n");
 228         if (t->implicit_text)
 229             dbg_prtmerge(0, t->name, t->implicit_text);
 230         for (j = 0; j < t->nexplicit; j++)
 231             dbg_prtmerge(1, t->name, t->explicit_texts[j]);
 232     }
 233
 234     printf("\nINDEX ENTRIES\n=============\n\n");
 235     for (ti = 0; (y = (indexentry *)index234(i->entries, ti)) != NULL; ti++) {
 236         printf("\n");
 237         printf("{\n");
 238         dbg_prtwordlist(1, y->text);
 239         printf("}\n");
 240     }
 241 }
 242
 243 static void dbg_prtmerge(int is_explicit, wchar_t *tag, word *text) {
 244     printf("\\IM: %splicit: \"", is_explicit ? "ex" : "im");
 245     for (; *tag; tag++)
 246         putchar(*tag);
 247     printf("\" {\n");
 248     dbg_prtwordlist(1, text);
 249     printf("}\n");
 250 }
 251
 252 static void dbg_prtwordlist(int level, word *w) {
 253     for (; w; w = w->next) {
 254         wchar_t *wp;
 255         printf("%*sword %d ", level*4, "", w->type);
 256         if (w->text) {
 257             printf("\"");
 258             for (wp = w->text; *wp; wp++)
 259                     putchar(*wp);
 260             printf("\"");
 261         } else
 262             printf("(no text)");
 263         if (w->alt) {
 264             printf(" alt = {\n");
 265             dbg_prtwordlist(level+1, w->alt);
 266             printf("%*s}", level*4, "");
 267         }
 268         printf("\n");
 269     }
 270 }