mdw@git.distorted.org.uk Git - disorder/blob - lib/words.c

   1 /*
   2  * This file is part of DisOrder
   3  * Copyright (C) 2004, 2007 Richard Kettlewell
   4  *
   5  * This program is free software; you can redistribute it and/or modify
   6  * it under the terms of the GNU General Public License as published by
   7  * the Free Software Foundation; either version 2 of the License, or
   8  * (at your option) any later version.
   9  *
  10  * This program is distributed in the hope that it will be useful, but
  11  * WITHOUT ANY WARRANTY; without even the implied warranty of
  12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13  * General Public License for more details.
  14  *
  15  * You should have received a copy of the GNU General Public License
  16  * along with this program; if not, write to the Free Software
  17  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
  18  * USA
  19  */
  20
  21 #include <config.h>
  22 #include "types.h"
  23
  24 #include <string.h>
  25 #include <stddef.h>
  26
  27 #include "mem.h"
  28 #include "vector.h"
  29 #include "table.h"
  30 #include "words.h"
  31 #include "utf8.h"
  32 #include "log.h"
  33 #include "charset.h"
  34
  35 #include "unidata.h"
  36
  37 const char *casefold(const char *ptr) {
  38   struct dynstr d;
  39   uint32_t c;
  40   const char *s = ptr;
  41
  42   dynstr_init(&d);
  43   while(*s) {
  44     /* Convert UTF-8 to UCS-32 */
  45     PARSE_UTF8(s, c, return ptr);
  46     /* Normalize */
  47     if(c < UNICODE_NCHARS) {
  48       /* If this a known character, convert it to lower case */
  49       const struct unidata *const ud = &unidata[c / 256][c % 256];
  50       c += ud->lower_offset;
  51     }
  52     /* Convert UCS-4 back to UTF-8 */
  53     one_ucs42utf8(c, &d);
  54   }
  55   dynstr_terminate(&d);
  56   return d.vec;
  57 }
  58
  59 static enum unicode_gc_cat cat(uint32_t c) {
  60   if(c < UNICODE_NCHARS) {
  61     /* If this a known character, convert it to lower case */
  62     const struct unidata *const ud = &unidata[c / 256][c % 256];
  63     return ud->gc;
  64   } else
  65     return unicode_gc_Cn;
  66 }
  67
  68 /* XXX this is a bit kludgy */
  69
  70 char **words(const char *s, int *nvecp) {
  71   struct vector v;
  72   struct dynstr d;
  73   const char *start;
  74   uint32_t c;
  75   int in_word = 0;
  76
  77   vector_init(&v);
  78   while(*s) {
  79     start = s;
  80     PARSE_UTF8(s, c, return 0);
  81     /* special cases first */
  82     switch(c) {
  83     case '/':
  84     case '.':
  85     case '+':
  86     case '&':
  87     case ':':
  88     case '_':
  89     case '-':
  90       goto separator;
  91     }
  92     /* do the rest on category */
  93     switch(cat(c)) {
  94     case unicode_gc_Ll:
  95     case unicode_gc_Lm:
  96     case unicode_gc_Lo:
  97     case unicode_gc_Lt:
  98     case unicode_gc_Lu:
  99     case unicode_gc_Nd:
 100     case unicode_gc_Nl:
 101     case unicode_gc_No:
 102     case unicode_gc_Sc:
 103     case unicode_gc_Sk:
 104     case unicode_gc_Sm:
 105     case unicode_gc_So:
 106       /* letters, digits and symbols are considered to be part of
 107        * words */
 108       if(!in_word) {
 109         dynstr_init(&d);
 110         in_word = 1;
 111       }
 112       dynstr_append_bytes(&d, start, s - start);
 113       break;
 114
 115     case unicode_gc_Cc:
 116     case unicode_gc_Cf:
 117     case unicode_gc_Co:
 118     case unicode_gc_Cs:
 119     case unicode_gc_Zl:
 120     case unicode_gc_Zp:
 121     case unicode_gc_Zs:
 122     case unicode_gc_Pe:
 123     case unicode_gc_Ps:
 124     separator:
 125       if(in_word) {
 126         dynstr_terminate(&d);
 127         vector_append(&v, d.vec);
 128         in_word = 0;
 129       }
 130       break;
 131
 132     case unicode_gc_Mc:
 133     case unicode_gc_Me:
 134     case unicode_gc_Mn:
 135     case unicode_gc_Pc:
 136     case unicode_gc_Pd:
 137     case unicode_gc_Pf:
 138     case unicode_gc_Pi:
 139     case unicode_gc_Po:
 140     case unicode_gc_Cn:
 141       /* control and punctuation is completely ignored */
 142       break;
 143
 144     }
 145   }
 146   if(in_word) {
 147     /* pick up the final word */
 148     dynstr_terminate(&d);
 149     vector_append(&v, d.vec);
 150   }
 151   vector_terminate(&v);
 152   if(nvecp)
 153     *nvecp = v.nvec;
 154   return v.vec;
 155 }
 156
 157 /*
 158 Local Variables:
 159 c-basic-offset:2
 160 comment-column:40
 161 End:
 162 */