more tag normalization work

[disorder] / server / trackdb.c
diff --git a/server/trackdb.c b/server/trackdb.c

index cb6b45a..d1878c0 100644 (file)
--- a/server/trackdb.c
+++ b/server/trackdb.c
@@ -71,10 +71,14 @@ static char **trackdb_new_tid(int *ntracksp,
                                int maxtracks,
                                DB_TXN *tid);
  static int trackdb_expire_noticed_tid(time_t earliest, DB_TXN *tid);
+static char *normalize_tag(const char *s, size_t ns);
  
  const struct cache_type cache_files_type = { 86400 };
  unsigned long cache_files_hits, cache_files_misses;
  
+/** @brief Set by trackdb_open() */
+int trackdb_existing_database;
+
  /* setup and teardown ********************************************************/
  
  static const char *home;                /* home had better not change */
@@ -306,7 +310,7 @@ static DB *open_db(const char *path,
   * - @p TRACKDB_OPEN_FOR_UPGRADE, if this is disorder-dbupgrade
   */
  void trackdb_open(int flags) {
-  int newdb, err;
+  int err;
    pid_t pid;
  
    /* sanity checks */
@@ -359,14 +363,14 @@ void trackdb_open(int flags) {
        /* This doesn't make any sense */
        fatal(0, "database is already at current version");
      }
-    newdb = 0;
+    trackdb_existing_database = 1;
    } else {
      if(flags & TRACKDB_OPEN_FOR_UPGRADE) {
        /* Cannot upgrade a new database */
        fatal(0, "cannot upgrade a database that does not exist");
      }
      /* This is a brand new database */
-    newdb = 1;
+    trackdb_existing_database = 0;
    }
    /* open the databases */
    trackdb_tracksdb = open_db("tracks.db",
@@ -379,7 +383,7 @@ void trackdb_open(int flags) {
    trackdb_globaldb = open_db("global.db", 0, DB_HASH, DB_CREATE, 0666);
    trackdb_noticeddb = open_db("noticed.db",
                               DB_DUPSORT, DB_BTREE, DB_CREATE, 0666);
-  if(newdb) {
+  if(!trackdb_existing_database) {
      /* Stash the database version */
      char buf[32];
  
@@ -637,6 +641,22 @@ static int tailor_underscore_Word_Break_Other(uint32_t c) {
    }
  }
  
+/** @brief Remove all combining characters in-place
+ * @param s Pointer to start of string (overwritten with the filtered result)
+ * @param ns Length of string, counted in uint32_t elements
+ * @return New, possibly reduced, length (no terminator is written)
+ */
+static size_t remove_combining_chars(uint32_t *s, size_t ns) {
+  uint32_t *start = s, *t = s, *end = s + ns; /* s reads, t writes; t <= s */
+
+  while(s < end) {
+    const uint32_t c = *s++;
+    if(!utf32_combining_class(c))       /* keep only combining class 0 */
+      *t++ = c;
+  }
+  return t - start;                     /* number of elements retained */
+}
+
  /** @brief Normalize and split a string using a given tailoring */
  static void word_split(struct vector *v,
                         const char *s,
@@ -650,6 +670,8 @@ static void word_split(struct vector *v,
    /* Erase case distinctions */
    if(!(t32 = utf32_casefold_compat(t32, nt32, &nt32)))
      return;
+  /* Drop combining characters */
+  nt32 = remove_combining_chars(t32, nt32);
    /* Split into words, treating _ as a space */
    w32 = utf32_word_split(t32, nt32, &nw, pt);
    /* Convert words back to UTF-8 and append to result */
@@ -657,6 +679,40 @@ static void word_split(struct vector *v,
      vector_append(v, utf32_to_utf8(w32[i], utf32_len(w32[i]), 0));
  }
  
+/** @brief Normalize a tag
+ * @param s Tag, as UTF-8 (bounded by @p ns)
+ * @param ns Length of tag in bytes
+ * @return Normalized string or NULL if decoding or case-folding fails
+ *
+ * The return value will be:
+ * - case-folded
+ * - have no leading or trailing space
+ * - have no combining characters
+ * - all spacing between words will be a single U+0020 SPACE
+ */
+static char *normalize_tag(const char *s, size_t ns) {
+  uint32_t *s32, **w32;
+  size_t ns32, nw32, i;
+  struct dynstr d[1];
+
+  if(!(s32 = utf8_to_utf32(s, ns, &ns32)))
+    return 0;
+  if(!(s32 = utf32_casefold_compat(s32, ns32, &ns32))) /* ->NFKD */
+    return 0;
+  ns32 = remove_combining_chars(s32, ns32);
+  /* Split into words, no Word_Break tailoring.  NOTE(review): w32 unchecked */
+  w32 = utf32_word_split(s32, ns32, &nw32, 0);
+  /* Join the words with single spaces; utf32_to_utf8() result is unchecked */
+  dynstr_init(d);
+  for(i = 0; i < nw32; ++i) {
+    if(i)
+      dynstr_append(d, ' ');
+    dynstr_append_string(d, utf32_to_utf8(w32[i], utf32_len(w32[i]), 0));
+  }
+  dynstr_terminate(d);
+  return d->vec;
+}
+
  /* compute the words of a track name */
  static char **track_to_words(const char *track,
                               const struct kvp *p) {
@@ -722,7 +778,8 @@ static char **parsetags(const char *s) {
        /* strip trailing spaces */
        while(s > t && s[-1] == ' ')
          --s;
-      vector_append(&v, xstrndup(t, s - t));
+      /* add tag to list */
+      vector_append(&v, normalize_tag(t, (size_t)(s - t)));
        /* skip intermediate and trailing separators */
        while(*s && (!tagchar(*s) || *s == ' '))
          ++s;
@@ -1044,11 +1101,43 @@ static int get_stats(struct vector *v,
    return 0;
  }
  
+/** @brief One entry in the search league: a word and how often it appears */
  struct search_entry {
    char *word;
    int n;
  };
  
+/** @brief Add a word to the search league if it ranks highly enough
+ * @param se Pointer to search league, kept in descending order of count
+ * @param count Maximum size for search league (NOTE(review): assumed > 0)
+ * @param nse Current size of search league
+ * @param word New word, or NULL (NULL makes the call a no-op)
+ * @param n How often @p word appears
+ * @return New size of search league (unchanged if nothing was inserted)
+ */
+static int register_search_entry(struct search_entry *se,
+                                 int count,
+                                 int nse,
+                                 char *word,
+                                 int n) {
+  int i;
+
+  if(word && (nse < count || n > se[nse - 1].n)) {
+    /* Start at the tail: reuse the last slot if full, else grow by one */
+    if(nse == count)
+      i = nse - 1;
+    else
+      i = nse++;
+    /* Move towards the head past entries with a strictly smaller count */
+    while(i > 0 && n > se[i - 1].n)
+      --i;
+    memmove(&se[i + 1], &se[i], (nse - i - 1) * sizeof *se);
+    se[i].word = word;
+    se[i].n = n;
+  }
+  return nse;
+}
+
  /* find the top COUNT words in the search database */
  static int search_league(struct vector *v, int count, DB_TXN *tid) {
    struct search_entry *se;
@@ -1061,25 +1150,14 @@ static int search_league(struct vector *v, int count, DB_TXN *tid) {
  
    cursor = trackdb_opencursor(trackdb_searchdb, tid);
    se = xmalloc(count * sizeof *se);
+  /* Walk across the whole database counting up the number of times each
+   * word appears. */
    while(!(err = cursor->c_get(cursor, prepare_data(&k), prepare_data(&d),
                                DB_NEXT))) {
      if(word && wl == k.size && !strncmp(word, k.data, wl))
-      ++n;
+      ++n;                              /* same word again */
      else {
-#define FINALIZE() do {                                                \
-  if(word && (nse < count || n > se[nse - 1].n)) {             \
-    if(nse == count)                                           \
-      i = nse - 1;                                             \
-    else                                                       \
-      i = nse++;                                               \
-    while(i > 0 && n > se[i - 1].n)                            \
-      --i;                                                     \
-    memmove(&se[i + 1], &se[i], (nse - i) * sizeof *se);       \
-    se[i].word = word;                                         \
-    se[i].n = n;                                               \
-  }                                                            \
-} while(0)
-      FINALIZE();
+      nse = register_search_entry(se, count, nse, word, n);
        word = xstrndup(k.data, wl = k.size);
        n = 1;
      }
@@ -1096,7 +1174,7 @@ static int search_league(struct vector *v, int count, DB_TXN *tid) {
    }
    if(trackdb_closecursor(cursor)) err = DB_LOCK_DEADLOCK;
    if(err) return err;
-  FINALIZE();
+  nse = register_search_entry(se, count, nse, word, n);
    byte_xasprintf(&str, "Top %d search words:", nse);
    vector_append(v, str);
    for(i = 0; i < nse; ++i) {
@@ -1149,7 +1227,7 @@ struct stats_details {
  
  static void stats_complete(struct stats_details *d) {
    char *s;
-  
+
    if(!(d->exited && d->closed))
      return;
    byte_xasprintf(&s, "\n"
@@ -1805,6 +1883,7 @@ static const char *checktag(const char *s) {
  char **trackdb_search(char **wordlist, int nwordlist, int *ntracks) {
    const char **w, *best = 0, *tag;
    char **twords, **tags;
+  char *istag;
    int i, j, n, err, what;
    DBC *cursor = 0;
    DBT k, d;
@@ -1816,15 +1895,32 @@ char **trackdb_search(char **wordlist, int nwordlist, int *ntracks) {
    const char *dbname;
  
    *ntracks = 0;                                /* for early returns */
-  /* casefold all the words */
+  /* normalize all the words */
    w = xmalloc(nwordlist * sizeof (char *));
+  istag = xmalloc_noptr(nwordlist);
    for(n = 0; n < nwordlist; ++n) {
+    uint32_t *w32;
+    size_t nw32;
+    
      w[n] = utf8_casefold_compat(wordlist[n], strlen(wordlist[n]), 0);
-    if(checktag(w[n])) ++ntags;         /* count up tags */
+    if(checktag(w[n])) {
+      ++ntags;         /* count up tags */
+      /* Normalize the tag */
+      w[n] = normalize_tag(w[n] + 4, strlen(w[n] + 4));
+      istag[n] = 1;
+    } else {
+      /* Normalize the search term by removing combining characters */
+      if(!(w32 = utf8_to_utf32(w[n], strlen(w[n]), &nw32)))
+        return 0;
+      nw32 = remove_combining_chars(w32, nw32);
+      if(!(w[n] = utf32_to_utf8(w32, nw32, 0)))
+        return 0;
+      istag[n] = 0;
+    }
    }
    /* find the longest non-stopword */
    for(n = 0; n < nwordlist; ++n)
-    if(!stopword(w[n]) && !checktag(w[n]))
+    if(!istag[n] && !stopword(w[n]))
        if(!best || strlen(w[n]) > strlen(best))
         best = w[n];
    /* TODO: we should at least in principal be able to identify the word or tag
@@ -1833,7 +1929,7 @@ char **trackdb_search(char **wordlist, int nwordlist, int *ntracks) {
    if(ntags && !best) {
      /* Only tags are listed.  We limit to the first and narrow down with the
       * rest. */
-    best = checktag(w[0]);
+    best = istag[0] ? w[0] : 0;
      db = trackdb_tagsdb;
      dbname = "tags";
    } else if(best) {
@@ -1882,7 +1978,8 @@ char **trackdb_search(char **wordlist, int nwordlist, int *ntracks) {
        twords = track_to_words(v.vec[n], p);
        tags = parsetags(kvp_get(p, "tags"));
        for(i = 0; i < nwordlist; ++i) {
-        if((tag = checktag(w[i]))) {
+        if(istag[i]) {
+          tag = w[i];
            /* Track must have this tag */
            for(j = 0; tags[j]; ++j)
              if(!strcmp(tag, tags[j])) break; /* tag found */