[sgt/halibut] / index.c

/*
 * index.c: create and collate index data structures
 */

#include <stdio.h>
#include <stdlib.h>
#include "halibut.h"

static int compare_tags(void *av, void *bv);
static int compare_entries(void *av, void *bv);

indexdata *make_index(void) {
    indexdata *ret = mknew(indexdata);
    ret->tags = newtree234(compare_tags);
    ret->entries = newtree234(compare_entries);
    return ret;
}

static indextag *make_indextag(void) {
    indextag *ret = mknew(indextag);
    ret->name = NULL;
    ret->implicit_text = NULL;
    ret->explicit_texts = NULL;
    ret->nexplicit = ret->explicit_size = ret->nrefs = 0;
    ret->refs = NULL;
    return ret;
}

static int compare_tags(void *av, void *bv) {
    indextag *a = (indextag *)av, *b = (indextag *)bv;
    return ustricmp(a->name, b->name);
}

static int compare_to_find_tag(void *av, void *bv) {
    wchar_t *a = (wchar_t *)av;
    indextag *b = (indextag *)bv;
    return ustricmp(a, b->name);
}

static int compare_entries(void *av, void *bv) {
    indexentry *a = (indexentry *)av, *b = (indexentry *)bv;
    return compare_wordlists(a->text, b->text);    
}

/*
 * Back-end utility: find the indextag with a given name.
 */
indextag *index_findtag(indexdata *idx, wchar_t *name) {
    return find234(idx->tags, name, compare_to_find_tag);
}

/*
 * Add a \IM. `tags' points to a zero-terminated chain of
 * zero-terminated strings ("first\0second\0thirdandlast\0\0").
 * `text' points to a word list.
 *
 * Guarantee on calling sequence: all implicit merges are given
 * before the explicit ones.
 */
void index_merge(indexdata *idx, int is_explicit, wchar_t *tags, word *text) {
    indextag *t, *existing;

    /*
     * For an implicit merge, we want to remove all emphasis,
     * because the chances are that the user didn't really want to
     * index the term as emphasised.
     */
    {
	word *w;

	for (w = text; w; w = w->next) {
	    if (w->type == word_Emph)
		w->type = word_Normal;
	    else if (w->type == word_EmphSpace)
		w->type = word_WhiteSpace;
	    else if (w->type == word_EmphQuote)
		w->type = word_Quote;
	}
    }

    /*
     * FIXME: want to warn on overlapping source sets.
     */
    for (; *tags; tags = uadv(tags)) {
	t = make_indextag();
	t->name = tags;
	existing = add234(idx->tags, t);
	if (existing == t) {
	    /*
	     * Duplicate this so we can free it independently.
	     */
	    t->name = ustrdup(tags);

	    /*
	     * Every tag has an implicit \IM. So if this tag
	     * doesn't exist and we're explicit, then we should
	     * warn (and drop it, since it won't be referenced).
	     */
	    if (is_explicit) {
		error(err_nosuchidxtag, tags);
		continue;
	    }

	    /*
	     * Otherwise, this is a new tag with an implicit \IM.
	     */
	    t->implicit_text = text;
	} else {
	    sfree(t);
	    t = existing;
	    if (!is_explicit) {
 		/*
		 * An implicit \IM for a tag that's had an implicit
		 * \IM before. FIXME: we should check the text
		 * against the existing text and warn on
		 * differences. And check the tag for case match
		 * against the existing tag, likewise.
		 */
	    } else {
		/*
		 * An explicit \IM added to a valid tag. In
		 * particular, this removes the implicit \IM if
		 * present.
		 */
		if (t->implicit_text) {
		    free_word_list(t->implicit_text);
		    t->implicit_text = NULL;
		}
		if (t->nexplicit >= t->explicit_size) {
		    t->explicit_size = t->nexplicit + 8;
		    t->explicit_texts = resize(t->explicit_texts,
					       t->explicit_size);
		}
		t->explicit_texts[t->nexplicit++] = text;
	    }
	}
    }
}

/*
 * Build the final-form index. We now have every tag, with every
 * \IM, set up in a 2-3 tree indexed by tag. We now want to collate
 * the RHSes of the \IMs, and sort by final form, and decorate the
 * entries in the original 2-3 tree with pointers to the RHS
 * entries.
 */
void build_index(indexdata *i) {
    indextag *t;
    word **ta;
    int ti;
    int j;

    for (ti = 0; (t = (indextag *)index234(i->tags, ti)) != NULL; ti++) {
	if (t->implicit_text) {
	    t->nrefs = 1;
	    ta = &t->implicit_text;
	} else {
	    t->nrefs = t->nexplicit;
	    ta = t->explicit_texts;
	}
	if (t->nrefs) {
	    t->refs = mknewa(indexentry *, t->nrefs);
	    for (j = 0; j < t->nrefs; j++) {
		indexentry *ent = mknew(indexentry);
		ent->text = *ta++;
		t->refs[j] = add234(i->entries, ent);
		if (t->refs[j] != ent)     /* duplicate */
		    sfree(ent);
	    }
	}
    }
}

void cleanup_index(indexdata *i) {
    indextag *t;
    indexentry *ent;
    int ti;

    for (ti = 0; (t = (indextag *)index234(i->tags, ti)) != NULL; ti++) {
	sfree(t->name);
	free_word_list(t->implicit_text);
	sfree(t->explicit_texts);
	sfree(t->refs);
	sfree(t);
    }
    freetree234(i->tags);
    for (ti = 0; (ent = (indexentry *)index234(i->entries, ti))!=NULL; ti++) {
	sfree(ent);
    }
    freetree234(i->entries);
    sfree(i);
}

static void dbg_prtwordlist(int level, word *w);
static void dbg_prtmerge(int is_explicit, wchar_t *tag, word *text);

void index_debug(indexdata *i) {
    indextag *t;
    indexentry *y;
    int ti;
    int j;

    printf("\nINDEX TAGS\n==========\n\n");
    for (ti = 0; (t = (indextag *)index234(i->tags, ti)) != NULL; ti++) {
        printf("\n");
	if (t->implicit_text)
	    dbg_prtmerge(0, t->name, t->implicit_text);
	for (j = 0; j < t->nexplicit; j++)
	    dbg_prtmerge(1, t->name, t->explicit_texts[j]);
    }

    printf("\nINDEX ENTRIES\n=============\n\n");
    for (ti = 0; (y = (indexentry *)index234(i->entries, ti)) != NULL; ti++) {
        printf("\n");
	printf("{\n");
	dbg_prtwordlist(1, y->text);
	printf("}\n");
    }
}

static void dbg_prtmerge(int is_explicit, wchar_t *tag, word *text) {
    printf("\\IM: %splicit: \"", is_explicit ? "ex" : "im");
    for (; *tag; tag++)
	putchar(*tag);
    printf("\" {\n");
    dbg_prtwordlist(1, text);
    printf("}\n");
}

static void dbg_prtwordlist(int level, word *w) {
    for (; w; w = w->next) {
	wchar_t *wp;
	printf("%*sword %d ", level*4, "", w->type);
	if (w->text) {
	    printf("\"");
	    for (wp = w->text; *wp; wp++)
		    putchar(*wp);
	    printf("\"");
	} else
	    printf("(no text)");
	if (w->alt) {
	    printf(" alt = {\n");
	    dbg_prtwordlist(level+1, w->alt);
	    printf("%*s}", level*4, "");
	}
	printf("\n");
    }
}
Commit	Line	Data
d7482997	1	/*
	2	* index.c: create and collate index data structures
	3	*/
	4
	5	#include <stdio.h>
	6	#include <stdlib.h>
	7	#include "halibut.h"
	8
	9	static int compare_tags(void av, void bv);
	10	static int compare_entries(void av, void bv);
	11
	12	indexdata *make_index(void) {
	13	indexdata *ret = mknew(indexdata);
	14	ret->tags = newtree234(compare_tags);
	15	ret->entries = newtree234(compare_entries);
	16	return ret;
	17	}
	18
	19	static indextag *make_indextag(void) {
	20	indextag *ret = mknew(indextag);
	21	ret->name = NULL;
	22	ret->implicit_text = NULL;
	23	ret->explicit_texts = NULL;
	24	ret->nexplicit = ret->explicit_size = ret->nrefs = 0;
	25	ret->refs = NULL;
	26	return ret;
	27	}
	28
	29	static int compare_tags(void av, void bv) {
	30	indextag a = (indextag )av, b = (indextag )bv;
	31	return ustricmp(a->name, b->name);
	32	}
	33
	34	static int compare_to_find_tag(void av, void bv) {
	35	wchar_t a = (wchar_t )av;
	36	indextag b = (indextag )bv;
	37	return ustricmp(a, b->name);
	38	}
	39
	40	static int compare_entries(void av, void bv) {
	41	indexentry a = (indexentry )av, b = (indexentry )bv;
	42	return compare_wordlists(a->text, b->text);
	43	}
	44
	45	/*
	46	* Back-end utility: find the indextag with a given name.
	47	*/
	48	indextag index_findtag(indexdata idx, wchar_t *name) {
	49	return find234(idx->tags, name, compare_to_find_tag);
	50	}
	51
	52	/*
	53	* Add a \IM. `tags' points to a zero-terminated chain of
	54	* zero-terminated strings ("first\0second\0thirdandlast\0\0").
	55	* `text' points to a word list.
	56	*
	57	* Guarantee on calling sequence: all implicit merges are given
	58	* before the explicit ones.
	59	*/
	60	void index_merge(indexdata idx, int is_explicit, wchar_t tags, word *text) {
	61	indextag t, existing;
	62
	63	/*
8856f150	64	* For an implicit merge, we want to remove all emphasis,
	65	* because the chances are that the user didn't really want to
	66	* index the term as emphasised.
	67	*/
	68	{
	69	word *w;
	70
	71	for (w = text; w; w = w->next) {
	72	if (w->type == word_Emph)
	73	w->type = word_Normal;
	74	else if (w->type == word_EmphSpace)
	75	w->type = word_WhiteSpace;
	76	else if (w->type == word_EmphQuote)
	77	w->type = word_Quote;
	78	}
	79	}
	80
	81	/*
d7482997	82	* FIXME: want to warn on overlapping source sets.
	83	*/
	84	for (; *tags; tags = uadv(tags)) {
	85	t = make_indextag();
	86	t->name = tags;
	87	existing = add234(idx->tags, t);
	88	if (existing == t) {
	89	/*
	90	* Duplicate this so we can free it independently.
	91	*/
	92	t->name = ustrdup(tags);
	93
	94	/*
	95	* Every tag has an implicit \IM. So if this tag
	96	* doesn't exist and we're explicit, then we should
	97	* warn (and drop it, since it won't be referenced).
	98	*/
	99	if (is_explicit) {
	100	error(err_nosuchidxtag, tags);
	101	continue;
	102	}
	103
	104	/*
	105	* Otherwise, this is a new tag with an implicit \IM.
	106	*/
	107	t->implicit_text = text;
	108	} else {
	109	sfree(t);
	110	t = existing;
	111	if (!is_explicit) {
	112	/*
	113	* An implicit \IM for a tag that's had an implicit
	114	* \IM before. FIXME: we should check the text
	115	* against the existing text and warn on
	116	* differences. And check the tag for case match
	117	* against the existing tag, likewise.
	118	*/
	119	} else {
	120	/*
	121	* An explicit \IM added to a valid tag. In
	122	* particular, this removes the implicit \IM if
	123	* present.
	124	*/
	125	if (t->implicit_text) {
	126	free_word_list(t->implicit_text);
	127	t->implicit_text = NULL;
	128	}
	129	if (t->nexplicit >= t->explicit_size) {
	130	t->explicit_size = t->nexplicit + 8;
	131	t->explicit_texts = resize(t->explicit_texts,
	132	t->explicit_size);
	133	}
	134	t->explicit_texts[t->nexplicit++] = text;
	135	}
	136	}
	137	}
	138	}
	139
	140	/*
	141	* Build the final-form index. We now have every tag, with every
	142	* \IM, set up in a 2-3 tree indexed by tag. We now want to collate
	143	* the RHSes of the \IMs, and sort by final form, and decorate the
	144	* entries in the original 2-3 tree with pointers to the RHS
	145	* entries.
146	*/
147	void build_index(indexdata *i) {
148	indextag *t;
149	word **ta;
150	int ti;
151	int j;
152
153	for (ti = 0; (t = (indextag *)index234(i->tags, ti)) != NULL; ti++) {
154	if (t->implicit_text) {
155	t->nrefs = 1;
156	ta = &t->implicit_text;
157	} else {
158	t->nrefs = t->nexplicit;
159	ta = t->explicit_texts;
160	}
161	if (t->nrefs) {
162	t->refs = mknewa(indexentry *, t->nrefs);
163	for (j = 0; j < t->nrefs; j++) {
164	indexentry *ent = mknew(indexentry);
165	ent->text = *ta++;
166	t->refs[j] = add234(i->entries, ent);
167	if (t->refs[j] != ent) /* duplicate */
168	sfree(ent);
169	}
170	}
171	}
172	}
173
174	void cleanup_index(indexdata *i) {
175	indextag *t;
176	indexentry *ent;
177	int ti;
178
179	for (ti = 0; (t = (indextag *)index234(i->tags, ti)) != NULL; ti++) {
180	sfree(t->name);
181	free_word_list(t->implicit_text);
182	sfree(t->explicit_texts);
183	sfree(t->refs);
184	sfree(t);
185	}
186	freetree234(i->tags);
187	for (ti = 0; (ent = (indexentry *)index234(i->entries, ti))!=NULL; ti++) {
188	sfree(ent);
189	}
190	freetree234(i->entries);
191	sfree(i);
192	}
193
194	static void dbg_prtwordlist(int level, word *w);
195	static void dbg_prtmerge(int is_explicit, wchar_t tag, word text);
196
197	void index_debug(indexdata *i) {
198	indextag *t;
199	indexentry *y;
200	int ti;
201	int j;
202
203	printf("\nINDEX TAGS\n==========\n\n");
204	for (ti = 0; (t = (indextag *)index234(i->tags, ti)) != NULL; ti++) {
205	printf("\n");
206	if (t->implicit_text)
207	dbg_prtmerge(0, t->name, t->implicit_text);
208	for (j = 0; j < t->nexplicit; j++)
209	dbg_prtmerge(1, t->name, t->explicit_texts[j]);
210	}
211
212	printf("\nINDEX ENTRIES\n=============\n\n");
213	for (ti = 0; (y = (indexentry *)index234(i->entries, ti)) != NULL; ti++) {
214	printf("\n");
215	printf("{\n");
216	dbg_prtwordlist(1, y->text);
217	printf("}\n");
218	}
219	}
220
221	static void dbg_prtmerge(int is_explicit, wchar_t tag, word text) {
222	printf("\\IM: %splicit: \"", is_explicit ? "ex" : "im");
223	for (; *tag; tag++)
224	putchar(*tag);
225	printf("\" {\n");
226	dbg_prtwordlist(1, text);
227	printf("}\n");
228	}
229
230	static void dbg_prtwordlist(int level, word *w) {
231	for (; w; w = w->next) {
232	wchar_t *wp;
233	printf("%sword %d ", level4, "", w->type);
234	if (w->text) {
235	printf("\"");
236	for (wp = w->text; *wp; wp++)
237	putchar(*wp);
238	printf("\"");
239	} else
240	printf("(no text)");
241	if (w->alt) {
242	printf(" alt = {\n");
243	dbg_prtwordlist(level+1, w->alt);
244	printf("%s}", level4, "");
245	}
246	printf("\n");
247	}
248	}