int maxtracks,
DB_TXN *tid);
static int trackdb_expire_noticed_tid(time_t earliest, DB_TXN *tid);
+static char *normalize_tag(const char *s, size_t ns);
const struct cache_type cache_files_type = { 86400 };
unsigned long cache_files_hits, cache_files_misses;
+/** @brief Set by trackdb_open() */
+int trackdb_existing_database;
+
/* setup and teardown ********************************************************/
static const char *home; /* home had better not change */
* - @p TRACKDB_OPEN_FOR_UPGRADE, if this is disorder-dbupgrade
*/
void trackdb_open(int flags) {
- int newdb, err;
+ int err;
pid_t pid;
/* sanity checks */
/* This doesn't make any sense */
fatal(0, "database is already at current version");
}
- newdb = 0;
+ trackdb_existing_database = 1;
} else {
if(flags & TRACKDB_OPEN_FOR_UPGRADE) {
/* Cannot upgrade a new database */
fatal(0, "cannot upgrade a database that does not exist");
}
/* This is a brand new database */
- newdb = 1;
+ trackdb_existing_database = 0;
}
/* open the databases */
trackdb_tracksdb = open_db("tracks.db",
trackdb_globaldb = open_db("global.db", 0, DB_HASH, DB_CREATE, 0666);
trackdb_noticeddb = open_db("noticed.db",
DB_DUPSORT, DB_BTREE, DB_CREATE, 0666);
- if(newdb) {
+ if(!trackdb_existing_database) {
/* Stash the database version */
char buf[32];
}
}
+/** @brief Strip every combining character from a UTF-32 string, in place
+ * @param s Pointer to start of string
+ * @param ns Length of string (number of uint32_t elements)
+ * @return New, possibly reduced, length
+ *
+ * Elements for which utf32_combining_class() returns nonzero are dropped;
+ * the remaining elements are packed towards the start of @p s in their
+ * original order.  Nothing is written beyond the returned length.
+ */
+static size_t remove_combining_chars(uint32_t *s, size_t ns) {
+  size_t in, out = 0;
+
+  for(in = 0; in < ns; ++in) {
+    const uint32_t c = s[in];
+    if(utf32_combining_class(c))
+      continue;                 /* nonzero combining class: drop it */
+    s[out++] = c;               /* keep, compacting as we go */
+  }
+  return out;
+}
+
/** @brief Normalize and split a string using a given tailoring */
static void word_split(struct vector *v,
const char *s,
/* Erase case distinctions */
if(!(t32 = utf32_casefold_compat(t32, nt32, &nt32)))
return;
+ /* Drop combining characters */
+ nt32 = remove_combining_chars(t32, nt32);
/* Split into words, treating _ as a space */
w32 = utf32_word_split(t32, nt32, &nw, pt);
/* Convert words back to UTF-8 and append to result */
vector_append(v, utf32_to_utf8(w32[i], utf32_len(w32[i]), 0));
}
+/** @brief Normalize a tag
+ * @param s Tag
+ * @param ns Length of tag
+ * @return Normalized string or NULL on error
+ *
+ * The return value will be:
+ * - case-folded
+ * - have no leading or trailing space
+ * - have no combining characters
+ * - all spacing between words will be a single U+0020 SPACE
+ *
+ * NULL is returned if any of the UTF-8/UTF-32 conversions or the
+ * case-folding step fails.
+ */
+static char *normalize_tag(const char *s, size_t ns) {
+  uint32_t *s32, **w32;
+  size_t ns32, nw32, i;
+  struct dynstr d[1];
+  char *word;
+
+  if(!(s32 = utf8_to_utf32(s, ns, &ns32)))
+    return 0;
+  if(!(s32 = utf32_casefold_compat(s32, ns32, &ns32))) /* ->NFKD */
+    return 0;
+  ns32 = remove_combining_chars(s32, ns32);
+  /* Split into words, no Word_Break tailoring */
+  w32 = utf32_word_split(s32, ns32, &nw32, 0);
+  /* Compose back into a string */
+  dynstr_init(d);
+  for(i = 0; i < nw32; ++i) {
+    /* Converting back to UTF-8 can fail; report that as an error instead
+     * of handing a null pointer to dynstr_append_string(). */
+    if(!(word = utf32_to_utf8(w32[i], utf32_len(w32[i]), 0)))
+      return 0;
+    if(i)
+      dynstr_append(d, ' ');    /* exactly one space between words */
+    dynstr_append_string(d, word);
+  }
+  dynstr_terminate(d);
+  return d->vec;
+}
+
/* compute the words of a track name */
static char **track_to_words(const char *track,
const struct kvp *p) {
/* strip trailing spaces */
while(s > t && s[-1] == ' ')
--s;
- vector_append(&v, xstrndup(t, s - t));
+ /* add tag to list */
+ vector_append(&v, normalize_tag(t, (size_t)(s - t)));
/* skip intermediate and trailing separators */
while(*s && (!tagchar(*s) || *s == ' '))
++s;
return 0;
}
+/** @brief One entry in the search league */
struct search_entry {
char *word; /* the search word */
int n; /* how often the word appears */
};
+/** @brief Add a word to the search league
+ * @param se Pointer to search league
+ * @param count Maximum size for search league
+ * @param nse Current size of search league
+ * @param word New word, or NULL
+ * @param n How often @p word appears
+ * @return New size of search league
+ *
+ * The league is kept in descending order of @p n; a new word is placed
+ * after any existing entries with the same count.  If the league is
+ * already full the new word only gets in by beating the last entry, which
+ * is then dropped.  Nothing happens if @p word is NULL or if @p count is
+ * not positive.
+ */
+static int register_search_entry(struct search_entry *se,
+                                 int count,
+                                 int nse,
+                                 char *word,
+                                 int n) {
+  int i;
+
+  /* No word to record, or a league with no slots at all.  Without the
+   * count check the comparison below would read se[-1] when nse is 0. */
+  if(!word || count <= 0)
+    return nse;
+  if(nse < count || n > se[nse - 1].n) {
+    /* Find the starting point: reuse the last slot if full, otherwise
+     * grow the league by one */
+    if(nse == count)
+      i = nse - 1;
+    else
+      i = nse++;
+    /* Find the insertion point */
+    while(i > 0 && n > se[i - 1].n)
+      --i;
+    /* Shift the lower-ranked entries down one place; the final slot is
+     * either new or the entry being dropped, so it is not moved */
+    memmove(&se[i + 1], &se[i], (nse - i - 1) * sizeof *se);
+    se[i].word = word;
+    se[i].n = n;
+  }
+  return nse;
+}
+
/* find the top COUNT words in the search database */
static int search_league(struct vector *v, int count, DB_TXN *tid) {
struct search_entry *se;
cursor = trackdb_opencursor(trackdb_searchdb, tid);
se = xmalloc(count * sizeof *se);
+ /* Walk across the whole database counting up the number of times each
+ * word appears. */
while(!(err = cursor->c_get(cursor, prepare_data(&k), prepare_data(&d),
DB_NEXT))) {
if(word && wl == k.size && !strncmp(word, k.data, wl))
- ++n;
+ ++n; /* same word again */
else {
-#define FINALIZE() do { \
- if(word && (nse < count || n > se[nse - 1].n)) { \
- if(nse == count) \
- i = nse - 1; \
- else \
- i = nse++; \
- while(i > 0 && n > se[i - 1].n) \
- --i; \
- memmove(&se[i + 1], &se[i], (nse - i) * sizeof *se); \
- se[i].word = word; \
- se[i].n = n; \
- } \
-} while(0)
- FINALIZE();
+ nse = register_search_entry(se, count, nse, word, n);
word = xstrndup(k.data, wl = k.size);
n = 1;
}
}
if(trackdb_closecursor(cursor)) err = DB_LOCK_DEADLOCK;
if(err) return err;
- FINALIZE();
+ nse = register_search_entry(se, count, nse, word, n);
byte_xasprintf(&str, "Top %d search words:", nse);
vector_append(v, str);
for(i = 0; i < nse; ++i) {
static void stats_complete(struct stats_details *d) {
char *s;
-
+
if(!(d->exited && d->closed))
return;
byte_xasprintf(&s, "\n"
char **trackdb_search(char **wordlist, int nwordlist, int *ntracks) {
const char **w, *best = 0, *tag;
char **twords, **tags;
+ char *istag;
int i, j, n, err, what;
DBC *cursor = 0;
DBT k, d;
const char *dbname;
*ntracks = 0; /* for early returns */
- /* casefold all the words */
+ /* normalize all the words */
w = xmalloc(nwordlist * sizeof (char *));
+ istag = xmalloc_noptr(nwordlist);
for(n = 0; n < nwordlist; ++n) {
+ uint32_t *w32;
+ size_t nw32;
+
w[n] = utf8_casefold_compat(wordlist[n], strlen(wordlist[n]), 0);
- if(checktag(w[n])) ++ntags; /* count up tags */
+ if(checktag(w[n])) {
+ ++ntags; /* count up tags */
+ /* Normalize the tag */
+ w[n] = normalize_tag(w[n] + 4, strlen(w[n] + 4));
+ istag[n] = 1;
+ } else {
+ /* Normalize the search term by removing combining characters */
+ if(!(w32 = utf8_to_utf32(w[n], strlen(w[n]), &nw32)))
+ return 0;
+ nw32 = remove_combining_chars(w32, nw32);
+ if(!(w[n] = utf32_to_utf8(w32, nw32, 0)))
+ return 0;
+ istag[n] = 0;
+ }
}
/* find the longest non-stopword */
for(n = 0; n < nwordlist; ++n)
- if(!stopword(w[n]) && !checktag(w[n]))
+ if(!istag[n] && !stopword(w[n]))
if(!best || strlen(w[n]) > strlen(best))
best = w[n];
/* TODO: we should at least in principle be able to identify the word or tag
if(ntags && !best) {
/* Only tags are listed. We limit to the first and narrow down with the
* rest. */
- best = checktag(w[0]);
+ best = istag[0] ? w[0] : 0;
db = trackdb_tagsdb;
dbname = "tags";
} else if(best) {
twords = track_to_words(v.vec[n], p);
tags = parsetags(kvp_get(p, "tags"));
for(i = 0; i < nwordlist; ++i) {
- if((tag = checktag(w[i]))) {
+ if(istag[i]) {
+ tag = w[i];
/* Track must have this tag */
for(j = 0; tags[j]; ++j)
if(!strcmp(tag, tags[j])) break; /* tag found */