X-Git-Url: https://git.distorted.org.uk/~mdw/disorder/blobdiff_plain/4ecbdbd99dea3236c3c6d5ea5401a08c56de5d3c..d1d4a182d95ffeb0fc607c90842256d9b4ab7c43:/server/trackdb.c diff --git a/server/trackdb.c b/server/trackdb.c index bc526ca..d1878c0 100644 --- a/server/trackdb.c +++ b/server/trackdb.c @@ -71,10 +71,14 @@ static char **trackdb_new_tid(int *ntracksp, int maxtracks, DB_TXN *tid); static int trackdb_expire_noticed_tid(time_t earliest, DB_TXN *tid); +static char *normalize_tag(const char *s, size_t ns); const struct cache_type cache_files_type = { 86400 }; unsigned long cache_files_hits, cache_files_misses; +/** @brief Set by trackdb_open() */ +int trackdb_existing_database; + /* setup and teardown ********************************************************/ static const char *home; /* home had better not change */ @@ -306,7 +310,7 @@ static DB *open_db(const char *path, * - @p TRACKDB_OPEN_FOR_UPGRADE, if this is disorder-dbupgrade */ void trackdb_open(int flags) { - int newdb, err; + int err; pid_t pid; /* sanity checks */ @@ -359,14 +363,14 @@ void trackdb_open(int flags) { /* This doesn't make any sense */ fatal(0, "database is already at current version"); } - newdb = 0; + trackdb_existing_database = 1; } else { if(flags & TRACKDB_OPEN_FOR_UPGRADE) { /* Cannot upgrade a new database */ fatal(0, "cannot upgrade a database that does not exist"); } /* This is a brand new database */ - newdb = 1; + trackdb_existing_database = 0; } /* open the databases */ trackdb_tracksdb = open_db("tracks.db", @@ -379,7 +383,7 @@ void trackdb_open(int flags) { trackdb_globaldb = open_db("global.db", 0, DB_HASH, DB_CREATE, 0666); trackdb_noticeddb = open_db("noticed.db", DB_DUPSORT, DB_BTREE, DB_CREATE, 0666); - if(newdb) { + if(!trackdb_existing_database) { /* Stash the database version */ char buf[32]; @@ -675,6 +679,40 @@ static void word_split(struct vector *v, vector_append(v, utf32_to_utf8(w32[i], utf32_len(w32[i]), 0)); } +/** @brief Normalize a tag + * @param s Tag + * @param ns Length of tag + * @return Normalized string or NULL on error + * + * The return value will be: + * - case-folded + * - have no leading or trailing space + * - have no combining characters + * - all spacing between words will be a single U+0020 SPACE + */ +static char *normalize_tag(const char *s, size_t ns) { + uint32_t *s32, **w32; + size_t ns32, nw32, i; + struct dynstr d[1]; + + if(!(s32 = utf8_to_utf32(s, ns, &ns32))) + return 0; + if(!(s32 = utf32_casefold_compat(s32, ns32, &ns32))) /* ->NFKD */ + return 0; + ns32 = remove_combining_chars(s32, ns32); + /* Split into words, no Word_Break tailoring */ + w32 = utf32_word_split(s32, ns32, &nw32, 0); + /* Compose back into a string */ + dynstr_init(d); + for(i = 0; i < nw32; ++i) { + if(i) + dynstr_append(d, ' '); + dynstr_append_string(d, utf32_to_utf8(w32[i], utf32_len(w32[i]), 0)); + } + dynstr_terminate(d); + return d->vec; +} + /* compute the words of a track name */ static char **track_to_words(const char *track, const struct kvp *p) { @@ -740,7 +778,8 @@ static char **parsetags(const char *s) { /* strip trailing spaces */ while(s > t && s[-1] == ' ') --s; - vector_append(&v, xstrndup(t, s - t)); + /* add tag to list */ + vector_append(&v, normalize_tag(t, (size_t)(s - t))); /* skip intermediate and trailing separators */ while(*s && (!tagchar(*s) || *s == ' ')) ++s; @@ -1062,11 +1101,43 @@ static int get_stats(struct vector *v, return 0; } +/** @brief One entry in the search league */ struct search_entry { char *word; int n; }; +/** @brief Add a word to the search league + * @param se Pointer to search league + * @param count Maximum size for search league + * @param nse Current size of search league + * @param word New word, or NULL + * @param n How often @p word appears + * @return New size of search league + */ +static int register_search_entry(struct search_entry *se, + int count, + int nse, + char *word, + int n) { + int i; + + if(word && (nse < count || n > se[nse - 1].n)) { + /* Find the starting point */ + if(nse == count) + i = nse - 1; + else + i = nse++; + /* Find the insertion point */ + while(i > 0 && n > se[i - 1].n) + --i; + memmove(&se[i + 1], &se[i], (nse - i - 1) * sizeof *se); + se[i].word = word; + se[i].n = n; + } + return nse; +} + /* find the top COUNT words in the search database */ static int search_league(struct vector *v, int count, DB_TXN *tid) { struct search_entry *se; @@ -1079,25 +1150,14 @@ static int search_league(struct vector *v, int count, DB_TXN *tid) { cursor = trackdb_opencursor(trackdb_searchdb, tid); se = xmalloc(count * sizeof *se); + /* Walk across the whole database counting up the number of times each + * word appears. */ while(!(err = cursor->c_get(cursor, prepare_data(&k), prepare_data(&d), DB_NEXT))) { if(word && wl == k.size && !strncmp(word, k.data, wl)) - ++n; + ++n; /* same word again */ else { -#define FINALIZE() do { \ - if(word && (nse < count || n > se[nse - 1].n)) { \ - if(nse == count) \ - i = nse - 1; \ - else \ - i = nse++; \ - while(i > 0 && n > se[i - 1].n) \ - --i; \ - memmove(&se[i + 1], &se[i], (nse - i) * sizeof *se); \ - se[i].word = word; \ - se[i].n = n; \ - } \ -} while(0) - FINALIZE(); + nse = register_search_entry(se, count, nse, word, n); word = xstrndup(k.data, wl = k.size); n = 1; } @@ -1114,7 +1174,7 @@ static int search_league(struct vector *v, int count, DB_TXN *tid) { } if(trackdb_closecursor(cursor)) err = DB_LOCK_DEADLOCK; if(err) return err; - FINALIZE(); + nse = register_search_entry(se, count, nse, word, n); byte_xasprintf(&str, "Top %d search words:", nse); vector_append(v, str); for(i = 0; i < nse; ++i) { @@ -1167,7 +1227,7 @@ struct stats_details { static void stats_complete(struct stats_details *d) { char *s; - + if(!(d->exited && d->closed)) return; byte_xasprintf(&s, "\n" @@ -1823,6 +1883,7 @@ static const char *checktag(const char *s) { char **trackdb_search(char **wordlist, int nwordlist, int *ntracks) { const char **w, *best = 0, *tag; char **twords, **tags; + char *istag; int i, j, n, err, what; DBC *cursor = 0; DBT k, d; @@ -1836,22 +1897,30 @@ char **trackdb_search(char **wordlist, int nwordlist, int *ntracks) { *ntracks = 0; /* for early returns */ /* normalize all the words */ w = xmalloc(nwordlist * sizeof (char *)); + istag = xmalloc_noptr(nwordlist); for(n = 0; n < nwordlist; ++n) { uint32_t *w32; size_t nw32; w[n] = utf8_casefold_compat(wordlist[n], strlen(wordlist[n]), 0); - if(checktag(w[n])) ++ntags; /* count up tags */ - /* Strip out combining characters (AFTER checking whether it's a tag) */ - if(!(w32 = utf8_to_utf32(w[n], strlen(w[n]), &nw32))) - return 0; - nw32 = remove_combining_chars(w32, nw32); - if(!(w[n] = utf32_to_utf8(w32, nw32, 0))) - return 0; + if(checktag(w[n])) { + ++ntags; /* count up tags */ + /* Normalize the tag */ + w[n] = normalize_tag(w[n] + 4, strlen(w[n] + 4)); + istag[n] = 1; + } else { + /* Normalize the search term by removing combining characters */ + if(!(w32 = utf8_to_utf32(w[n], strlen(w[n]), &nw32))) + return 0; + nw32 = remove_combining_chars(w32, nw32); + if(!(w[n] = utf32_to_utf8(w32, nw32, 0))) + return 0; + istag[n] = 0; + } } /* find the longest non-stopword */ for(n = 0; n < nwordlist; ++n) - if(!stopword(w[n]) && !checktag(w[n])) + if(!istag[n] && !stopword(w[n])) if(!best || strlen(w[n]) > strlen(best)) best = w[n]; /* TODO: we should at least in principal be able to identify the word or tag @@ -1860,7 +1929,7 @@ char **trackdb_search(char **wordlist, int nwordlist, int *ntracks) { if(ntags && !best) { /* Only tags are listed. We limit to the first and narrow down with the * rest. */ - best = checktag(w[0]); + best = istag[0] ? w[0] : 0; db = trackdb_tagsdb; dbname = "tags"; } else if(best) { @@ -1909,7 +1978,8 @@ char **trackdb_search(char **wordlist, int nwordlist, int *ntracks) { twords = track_to_words(v.vec[n], p); tags = parsetags(kvp_get(p, "tags")); for(i = 0; i < nwordlist; ++i) { - if((tag = checktag(w[i]))) { + if(istag[i]) { + tag = w[i]; /* Track must have this tag */ for(j = 0; tags[j]; ++j) if(!strcmp(tag, tags[j])) break; /* tag found */