X-Git-Url: https://git.distorted.org.uk/~mdw/disorder/blobdiff_plain/460b9539a7c15580e41a71bbc0f47ae776238915..0ec4c5e69dc74bbe543506cd3bbd812d686fb4fd:/lib/words.c

diff --git a/lib/words.c b/lib/words.c
index 2e4001d..e6e4087 100644
--- a/lib/words.c
+++ b/lib/words.c
@@ -1,6 +1,6 @@
 /*
  * This file is part of DisOrder
- * Copyright (C) 2004 Richard Kettlewell
+ * Copyright (C) 2004, 2007 Richard Kettlewell
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -29,54 +29,40 @@
 #include "table.h"
 #include "words.h"
 #include "utf8.h"
+#include "log.h"
+#include "charset.h"
 
-#include "casefold.h"
-#include "unicodegc.h"
+#include "unidata.h"
 
 const char *casefold(const char *ptr) {
   struct dynstr d;
-  int l, r, m;
   uint32_t c;
-  const struct cm *t;
-  const char *start, *s = ptr;
+  const char *s = ptr;
 
   dynstr_init(&d);
   while(*s) {
-    start = s;
+    /* Convert UTF-8 to UCS-4 */
     PARSE_UTF8(s, c, return ptr);
-    /* seek the folded equivalent */
-    t = cm[c & CM_MASK];
-    l = 0;
-    r = cmn[c & CM_MASK] - 1;
-    while(l <= r && c != t[m = (l + r) / 2].ch)
-      if(c < t[m].ch)
-	r = m - 1;
-      else
-	l = m + 1;
-    if(l <= r)
-      dynstr_append_string(&d, t[m].tr);
-    else
-      dynstr_append_bytes(&d, start, s - start);
+    /* Normalize */
+    if(c < UNICODE_NCHARS) {
+      /* If this is a known character, convert it to lower case */
+      const struct unidata *const ud = &unidata[c / 256][c % 256];
+      c += ud->lower_offset;
+    }
+    /* Convert UCS-4 back to UTF-8 */
+    one_ucs42utf8(c, &d);
   }
   dynstr_terminate(&d);
   return d.vec;
 }
 
 static enum unicode_gc_cat cat(uint32_t c) {
-  int l, r, m;
-
-  l = 0;
-  r = sizeof gcs / sizeof *gcs;
-  while(l <= r) {
-    m = (l + r) / 2;
-    if(c < gcs[m].l)
-      r = m - 1;
-    else if(c > gcs[m].h)
-      l = m + 1;
-    else
-      return gcs[m].cat;
-  }
-  return unicode_gc_none;
+  if(c < UNICODE_NCHARS) {
+    /* If this is a known character, return its general category */
+    const struct unidata *const ud = &unidata[c / 256][c % 256];
+    return ud->gc;
+  } else
+    return unicode_gc_Cn;
 }
 
 /* XXX this is a bit kludgy */
@@ -151,7 +137,7 @@ char **words(const char *s, int *nvecp) {
     case unicode_gc_Pf:
     case unicode_gc_Pi:
     case unicode_gc_Po:
-    case unicode_gc_none:
+    case unicode_gc_Cn:
       /* control and punctuation is completely ignored */
       break;
 
@@ -174,4 +160,3 @@ c-basic-offset:2
 comment-column:40
 End:
 */
-/* arch-tag:0ea1f1700f14cd031b7f1fbbcca765fa */