X-Git-Url: https://git.distorted.org.uk/~mdw/disorder/blobdiff_plain/460b9539a7c15580e41a71bbc0f47ae776238915..5617aaff51ba333441230e3808bc697e66540492:/lib/words.c diff --git a/lib/words.c b/lib/words.c index 2e4001d..2638ea6 100644 --- a/lib/words.c +++ b/lib/words.c @@ -1,6 +1,6 @@ /* * This file is part of DisOrder - * Copyright (C) 2004 Richard Kettlewell + * Copyright (C) 2004, 2007 Richard Kettlewell * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -29,54 +29,22 @@ #include "table.h" #include "words.h" #include "utf8.h" +#include "log.h" +#include "charset.h" -#include "casefold.h" -#include "unicodegc.h" +#include "unidata.h" +#include "unicode.h" const char *casefold(const char *ptr) { - struct dynstr d; - int l, r, m; - uint32_t c; - const struct cm *t; - const char *start, *s = ptr; - - dynstr_init(&d); - while(*s) { - start = s; - PARSE_UTF8(s, c, return ptr); - /* seek the folded equivalent */ - t = cm[c & CM_MASK]; - l = 0; - r = cmn[c & CM_MASK] - 1; - while(l <= r && c != t[m = (l + r) / 2].ch) - if(c < t[m].ch) - r = m - 1; - else - l = m + 1; - if(l <= r) - dynstr_append_string(&d, t[m].tr); - else - dynstr_append_bytes(&d, start, s - start); - } - dynstr_terminate(&d); - return d.vec; + return utf8_casefold_canon(ptr, strlen(ptr), 0); } -static enum unicode_gc_cat cat(uint32_t c) { - int l, r, m; - - l = 0; - r = sizeof gcs / sizeof *gcs; - while(l <= r) { - m = (l + r) / 2; - if(c < gcs[m].l) - r = m - 1; - else if(c > gcs[m].h) - l = m + 1; - else - return gcs[m].cat; - } - return unicode_gc_none; +static enum unicode_General_Category cat(uint32_t c) { + if(c < UNICODE_NCHARS) { + const struct unidata *const ud = &unidata[c / UNICODE_MODULUS][c % UNICODE_MODULUS]; + return ud->general_category; + } else + return unicode_General_Category_Cn; } /* XXX this is a bit kludgy */ @@ -105,18 +73,18 @@ char **words(const char *s, int *nvecp) { } /* do the rest on category */ switch(cat(c)) { - case unicode_gc_Ll: - case unicode_gc_Lm: - case unicode_gc_Lo: - case unicode_gc_Lt: - case unicode_gc_Lu: - case unicode_gc_Nd: - case unicode_gc_Nl: - case unicode_gc_No: - case unicode_gc_Sc: - case unicode_gc_Sk: - case unicode_gc_Sm: - case unicode_gc_So: + case unicode_General_Category_Ll: + case unicode_General_Category_Lm: + case unicode_General_Category_Lo: + case unicode_General_Category_Lt: + case unicode_General_Category_Lu: + case unicode_General_Category_Nd: + case unicode_General_Category_Nl: + case unicode_General_Category_No: + case unicode_General_Category_Sc: + case unicode_General_Category_Sk: + case unicode_General_Category_Sm: + case unicode_General_Category_So: /* letters, digits and symbols are considered to be part of * words */ if(!in_word) { @@ -126,15 +94,15 @@ char **words(const char *s, int *nvecp) { dynstr_append_bytes(&d, start, s - start); break; - case unicode_gc_Cc: - case unicode_gc_Cf: - case unicode_gc_Co: - case unicode_gc_Cs: - case unicode_gc_Zl: - case unicode_gc_Zp: - case unicode_gc_Zs: - case unicode_gc_Pe: - case unicode_gc_Ps: + case unicode_General_Category_Cc: + case unicode_General_Category_Cf: + case unicode_General_Category_Co: + case unicode_General_Category_Cs: + case unicode_General_Category_Zl: + case unicode_General_Category_Zp: + case unicode_General_Category_Zs: + case unicode_General_Category_Pe: + case unicode_General_Category_Ps: separator: if(in_word) { dynstr_terminate(&d); @@ -143,15 +111,15 @@ char **words(const char *s, int *nvecp) { } break; - case unicode_gc_Mc: - case unicode_gc_Me: - case unicode_gc_Mn: - case unicode_gc_Pc: - case unicode_gc_Pd: - case unicode_gc_Pf: - case unicode_gc_Pi: - case unicode_gc_Po: - case unicode_gc_none: + case unicode_General_Category_Mc: + case unicode_General_Category_Me: + case unicode_General_Category_Mn: + case unicode_General_Category_Pc: + case unicode_General_Category_Pd: + case unicode_General_Category_Pf: + case unicode_General_Category_Pi: + case unicode_General_Category_Po: + case unicode_General_Category_Cn: /* control and punctuation is completely ignored */ break; @@ -174,4 +142,3 @@ c-basic-offset:2 comment-column:40 End: */ -/* arch-tag:0ea1f1700f14cd031b7f1fbbcca765fa */