mdw@git.distorted.org.uk Git - sgt/halibut/blob - ustring.c

   1 /*
   2  * ustring.c: Unicode string routines
   3  */
   4
   5 #include <wchar.h>
   6 #include <time.h>
   7 #include "halibut.h"
   8
   9 wchar_t *ustrdup(wchar_t const *s) {
  10     wchar_t *r;
  11     if (s) {
  12         r = mknewa(wchar_t, 1+ustrlen(s));
  13         ustrcpy(r, s);
  14     } else {
  15         r = mknew(wchar_t);
  16         *r = 0;
  17     }
  18     return r;
  19 }
  20
  21 static char *ustrtoa_internal(wchar_t const *s, char *outbuf, int size,
  22                               int charset, int careful) {
  23     int len, ret, err;
  24     charset_state state = CHARSET_INIT_STATE;
  25
  26     if (!s) {
  27         *outbuf = '\0';
  28         return outbuf;
  29     }
  30
  31     len = ustrlen(s);
  32     size--;                            /* leave room for terminating NUL */
  33     *outbuf = '\0';
  34     while (len > 0) {
  35         err = 0;
  36         ret = charset_from_unicode(&s, &len, outbuf, size, charset, &state,
  37                                    (careful ? &err : NULL));
  38         if (err)
  39             return NULL;
  40         if (!ret)
  41             return outbuf;
  42         size -= ret;
  43         outbuf += ret;
  44         *outbuf = '\0';
  45     }
  46     /*
  47      * Clean up
  48      */
  49     ret = charset_from_unicode(NULL, 0, outbuf, size, charset, &state, NULL);
  50     size -= ret;
  51     outbuf += ret;
  52     *outbuf = '\0';
  53     return outbuf;
  54 }
  55
  56 char *ustrtoa(wchar_t const *s, char *outbuf, int size, int charset) {
  57     return ustrtoa_internal(s, outbuf, size, charset, FALSE);
  58 }
  59
  60 char *ustrtoa_careful(wchar_t const *s, char *outbuf, int size, int charset) {
  61     return ustrtoa_internal(s, outbuf, size, charset, TRUE);
  62 }
  63
  64 wchar_t *ustrfroma(char const *s, wchar_t *outbuf, int size, int charset) {
  65     int len, ret;
  66     charset_state state = CHARSET_INIT_STATE;
  67
  68     if (!s) {
  69         *outbuf = L'\0';
  70         return outbuf;
  71     }
  72
  73     len = strlen(s);
  74     size--;                            /* allow for terminating NUL */
  75     *outbuf = L'\0';
  76     while (len > 0) {
  77         ret = charset_to_unicode(&s, &len, outbuf, size,
  78                                  charset, &state, NULL, 0);
  79         if (!ret)
  80             return outbuf;
  81         outbuf += ret;
  82         size -= ret;
  83         *outbuf = L'\0';
  84     }
  85     return outbuf;
  86 }
  87
  88 char *utoa_internal_dup(wchar_t const *s, int charset, int *lenp, int careful)
  89 {
  90     char *outbuf;
  91     int outpos, outlen, len, ret, err;
  92     charset_state state = CHARSET_INIT_STATE;
  93
  94     if (!s) {
  95         return dupstr("");
  96     }
  97
  98     len = ustrlen(s);
  99
 100     outlen = len + 10;
 101     outbuf = mknewa(char, outlen);
 102
 103     outpos = 0;
 104     outbuf[outpos] = '\0';
 105
 106     while (len > 0) {
 107         err = 0;
 108         ret = charset_from_unicode(&s, &len,
 109                                    outbuf + outpos, outlen - outpos - 1,
 110                                    charset, &state, (careful ? &err : NULL));
 111         if (err) {
 112             sfree(outbuf);
 113             return NULL;
 114         }
 115         if (!ret) {
 116             outlen = outlen * 3 / 2;
 117             outbuf = resize(outbuf, outlen);
 118         }
 119         outpos += ret;
 120         outbuf[outpos] = '\0';
 121     }
 122     /*
 123      * Clean up
 124      */
 125     outlen = outpos + 32;
 126     outbuf = resize(outbuf, outlen);
 127     ret = charset_from_unicode(NULL, 0,
 128                                outbuf + outpos, outlen - outpos + 1,
 129                                charset, &state, NULL);
 130     outpos += ret;
 131     outbuf[outpos] = '\0';
 132     if (lenp)
 133         *lenp = outpos;
 134     return outbuf;
 135 }
 136
 137 char *utoa_dup(wchar_t const *s, int charset)
 138 {
 139     return utoa_internal_dup(s, charset, NULL, FALSE);
 140 }
 141
 142 char *utoa_dup_len(wchar_t const *s, int charset, int *len)
 143 {
 144     return utoa_internal_dup(s, charset, len, FALSE);
 145 }
 146
 147 char *utoa_careful_dup(wchar_t const *s, int charset)
 148 {
 149     return utoa_internal_dup(s, charset, NULL, TRUE);
 150 }
 151
 152 wchar_t *ufroma_dup(char const *s, int charset) {
 153     int len;
 154     wchar_t *buf = NULL;
 155
 156     len = strlen(s) + 1;
 157     do {
 158         buf = resize(buf, len);
 159         ustrfroma(s, buf, len, charset);
 160         len = (3 * len) / 2 + 1;       /* this guarantees a strict increase */
 161     } while (ustrlen(buf) >= len-1);
 162
 163     buf = resize(buf, ustrlen(buf)+1);
 164     return buf;
 165 }
 166
 167 int ustrlen(wchar_t const *s) {
 168     int len = 0;
 169     while (*s++) len++;
 170     return len;
 171 }
 172
 173 wchar_t *uadv(wchar_t *s) {
 174     return s + 1 + ustrlen(s);
 175 }
 176
 177 wchar_t *ustrcpy(wchar_t *dest, wchar_t const *source) {
 178     wchar_t *ret = dest;
 179     do {
 180         *dest++ = *source;
 181     } while (*source++);
 182     return ret;
 183 }
 184
 185 int ustrcmp(wchar_t *lhs, wchar_t *rhs) {
 186     if (!lhs && !rhs) return 0;
 187     if (!lhs) return -1;
 188     if (!rhs) return +1;
 189     while (*lhs && *rhs && *lhs==*rhs)
 190         lhs++, rhs++;
 191     if (*lhs < *rhs)
 192         return -1;
 193     else if (*lhs > *rhs)
 194         return 1;
 195     return 0;
 196 }
 197
 198 wchar_t utolower(wchar_t c) {
 199     if (c == L'\0')
 200         return c;                      /* this property needed by ustricmp */
 201     /* FIXME: this doesn't even come close */
 202     if (c >= 'A' && c <= 'Z')
 203         c += 'a'-'A';
 204     return c;
 205 }
 206
 207 int uisalpha(wchar_t c) {
 208     /* FIXME: this doesn't even come close */
 209     return (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z');
 210 }
 211
 212 int ustricmp(wchar_t *lhs, wchar_t *rhs) {
 213     wchar_t lc, rc;
 214     while ((lc = utolower(*lhs)) == (rc = utolower(*rhs)) && lc && rc)
 215         lhs++, rhs++;
 216     if (!lc && !rc)
 217         return 0;
 218     if (lc < rc)
 219         return -1;
 220     else
 221         return 1;
 222 }
 223
 224 wchar_t *ustrlow(wchar_t *s) {
 225     wchar_t *p = s;
 226     while (*p) {
 227         *p = utolower(*p);
 228         p++;
 229     }
 230     return s;
 231 }
 232
 233 int utoi(wchar_t *s) {
 234     int sign = +1;
 235     int n;
 236
 237     if (*s == L'-') {
 238         s++;
 239         sign = -1;
 240     }
 241
 242     n = 0;
 243     while (*s && *s >= L'0' && *s <= L'9') {
 244         n *= 10;
 245         n += (*s - '0');
 246         s++;
 247     }
 248
 249     return n;
 250 }
 251
 252 int utob(wchar_t *s) {
 253     if (!ustricmp(s, L"yes") || !ustricmp(s, L"y") ||
 254         !ustricmp(s, L"true") || !ustricmp(s, L"t"))
 255         return TRUE;
 256     return FALSE;
 257 }
 258
 259 int uisdigit(wchar_t c) {
 260     return c >= L'0' && c <= L'9';
 261 }
 262
 263 #define USTRFTIME_DELTA 128
 264 wchar_t *ustrftime(wchar_t *wfmt, struct tm *timespec) {
 265     void *blk = NULL;
 266     wchar_t *wblk, *wp;
 267     char *fmt, *text, *p;
 268     size_t size = 0;
 269     size_t len;
 270
 271     /*
 272      * FIXME: really we ought to copy non-% parts of the format
 273      * ourselves, and only resort to strftime for % parts. Also we
 274      * should use wcsftime if it's present.
 275      */
 276
 277     /*
 278      * strftime has the entertaining property that it returns 0
 279      * _either_ on out-of-space _or_ on successful generation of
 280      * the empty string. Hence we must ensure our format can never
 281      * generate the empty string. Somebody throw a custard pie at
 282      * whoever was responsible for that. Please?
 283      */
 284     if (wfmt) {
 285         len = ustrlen(wfmt);
 286         fmt = mknewa(char, 2+len);
 287         ustrtoa(wfmt, fmt+1, len+1, CS_ASCII);   /* CS_FIXME? */
 288         fmt[0] = ' ';
 289     } else
 290         fmt = " %c";
 291
 292     while (1) {
 293         size += USTRFTIME_DELTA;
 294         blk = resize((char *)blk, size);
 295         len = strftime((char *)blk, size-1, fmt, timespec);
 296         if (len > 0)
 297             break;
 298     }
 299
 300     /* Note: +1 for the terminating 0, -1 for the initial space in fmt */
 301     wblk = resize((wchar_t *)blk, len);
 302     text = mknewa(char, len);
 303     strftime(text, len, fmt+1, timespec);
 304     /*
 305      * We operate in the C locale, so this all ought to be kosher
 306      * ASCII. If we ever move outside ASCII machines, we may need
 307      * to make this more portable...
 308      */
 309     for (wp = wblk, p = text; *p; p++, wp++)
 310         *wp = *p;
 311     *wp = 0;
 312     if (wfmt)
 313         sfree(fmt);
 314     sfree(text);
 315     return wblk;
 316 }
 317
 318 /*
 319  * Determine whether a Unicode string can be translated into a
 320  * given charset without any missing characters.
 321  */
 322 int cvt_ok(int charset, const wchar_t *s)
 323 {
 324     char buf[256];
 325     charset_state state = CHARSET_INIT_STATE;
 326     int err, len = ustrlen(s);
 327
 328     err = 0;
 329     while (len > 0) {
 330         (void)charset_from_unicode(&s, &len, buf, lenof(buf),
 331                                    charset, &state, &err);
 332         if (err)
 333             return FALSE;
 334     }
 335     return TRUE;
 336 }