+/** @brief Word_Break property tailor that treats underscores as spaces
+ * @param c Code point to classify
+ * @return unicode_Word_Break_Other for U+005F, otherwise -1
+ *
+ * NOTE(review): -1 presumably tells the caller to fall back to the
+ * untailored property value -- confirm against the word splitter.
+ */
+static int tailor_underscore_Word_Break_Other(uint32_t c) {
+  /* U+005F LOW LINE (SPACING UNDERSCORE) is the only tailored character */
+  if(c == 0x005F)
+    return unicode_Word_Break_Other;
+  return -1;
+}
+
+/** @brief Remove all combining characters in-place
+ * @param s Pointer to start of string
+ * @param ns Length of string
+ * @return New, possibly reduced, length
+ *
+ * Characters for which utf32_combining_class() returns zero are kept, in
+ * their original order; every other character is dropped.  The string is
+ * compacted towards its start and is not re-terminated here.
+ */
+static size_t remove_combining_chars(uint32_t *s, size_t ns) {
+  size_t in, out = 0;
+
+  for(in = 0; in < ns; ++in) {
+    const uint32_t ch = s[in];
+
+    /* out never overtakes in, so unread input is never overwritten */
+    if(utf32_combining_class(ch) == 0)
+      s[out++] = ch;
+  }
+  return out;
+}
+
+/** @brief Normalize and split a string using a given tailoring
+ * @param v Vector to append the resulting words to
+ * @param s NUL-terminated UTF-8 string to split
+ * @param pt Word_Break property tailor passed to utf32_word_split()
+ *
+ * The string is converted to UTF-32, case-folded, stripped of combining
+ * characters and split into words.  Each word is converted back to UTF-8
+ * and appended to @p v.
+ *
+ * If the initial conversion or the case-folding fails, nothing is appended.
+ * A word that cannot be converted back to UTF-8 is skipped rather than
+ * appended as a null pointer.
+ */
+static void word_split(struct vector *v,
+                       const char *s,
+                       unicode_property_tailor *pt) {
+  /* nw starts at 0 so the loop is safe even if the splitter leaves it unset */
+  size_t nw = 0, nt32, i;
+  uint32_t *t32, **w32;
+  char *w8;
+
+  /* Convert to UTF-32 */
+  if(!(t32 = utf8_to_utf32(s, strlen(s), &nt32)))
+    return;
+  /* Erase case distinctions */
+  if(!(t32 = utf32_casefold_compat(t32, nt32, &nt32)))
+    return;
+  /* Drop combining characters */
+  nt32 = remove_combining_chars(t32, nt32);
+  /* Split into words using the caller's Word_Break tailoring */
+  w32 = utf32_word_split(t32, nt32, &nw, pt);
+  /* Convert words back to UTF-8 and append to result */
+  for(i = 0; i < nw; ++i) {
+    /* NOTE(review): assumes utf32_to_utf8() reports failure with a null
+     * pointer, like the other conversions checked above -- confirm */
+    if(!(w8 = utf32_to_utf8(w32[i], utf32_len(w32[i]), 0)))
+      continue;
+    vector_append(v, w8);
+  }
+}
+
+/** @brief Normalize a tag
+ * @param s Tag
+ * @param ns Length of tag
+ * @return Normalized string or NULL on error
+ *
+ * The return value will be:
+ * - case-folded
+ * - have no leading or trailing space
+ * - have no combining characters
+ * - all spacing between words will be a single U+0020 SPACE
+ *
+ * NULL is returned if the tag cannot be converted to UTF-32, cannot be
+ * case-folded, or if any resulting word cannot be converted back to UTF-8.
+ */
+static char *normalize_tag(const char *s, size_t ns) {
+  uint32_t *s32, **w32;
+  /* nw32 starts at 0 so the loop is safe even if the splitter leaves it unset */
+  size_t ns32, nw32 = 0, i;
+  struct dynstr d[1];
+  char *w8;
+
+  if(!(s32 = utf8_to_utf32(s, ns, &ns32)))
+    return 0;
+  if(!(s32 = utf32_casefold_compat(s32, ns32, &ns32))) /* ->NFKD */
+    return 0;
+  ns32 = remove_combining_chars(s32, ns32);
+  /* Split into words, no Word_Break tailoring */
+  w32 = utf32_word_split(s32, ns32, &nw32, 0);
+  /* Compose back into a string, joining words with single spaces */
+  dynstr_init(d);
+  for(i = 0; i < nw32; ++i) {
+    /* Convert before appending anything, so a failed conversion is reported
+     * as an error instead of handing a null pointer to
+     * dynstr_append_string().
+     * NOTE(review): assumes utf32_to_utf8() returns a null pointer on
+     * failure, like the conversions checked above -- confirm */
+    if(!(w8 = utf32_to_utf8(w32[i], utf32_len(w32[i]), 0)))
+      return 0;
+    if(i)
+      dynstr_append(d, ' ');
+    dynstr_append_string(d, w8);
+  }
+  dynstr_terminate(d);
+  return d->vec;
+}
+