mdw@git.distorted.org.uk Git - sgt/halibut/blob - ustring.c

   1 /*
   2  * ustring.c: Unicode string routines
   3  */
   4
   5 #include <wchar.h>
   6 #include <stdlib.h>
   7 #include <assert.h>
   8 #include <time.h>
   9 #include "halibut.h"
  10
  11 wchar_t *ustrdup(wchar_t const *s) {
  12     wchar_t *r;
  13     if (s) {
  14         r = mknewa(wchar_t, 1+ustrlen(s));
  15         ustrcpy(r, s);
  16     } else {
  17         r = mknew(wchar_t);
  18         *r = 0;
  19     }
  20     return r;
  21 }
  22
  23 static char *ustrtoa_internal(wchar_t const *s, char *outbuf, int size,
  24                               int charset, int careful) {
  25     int len, ret, err;
  26     charset_state state = CHARSET_INIT_STATE;
  27
  28     if (!s) {
  29         *outbuf = '\0';
  30         return outbuf;
  31     }
  32
  33     len = ustrlen(s);
  34     size--;                            /* leave room for terminating NUL */
  35     *outbuf = '\0';
  36     while (len > 0) {
  37         err = 0;
  38         ret = charset_from_unicode(&s, &len, outbuf, size, charset, &state,
  39                                    (careful ? &err : NULL));
  40         if (err)
  41             return NULL;
  42         if (!ret)
  43             return outbuf;
  44         size -= ret;
  45         outbuf += ret;
  46         *outbuf = '\0';
  47     }
  48     /*
  49      * Clean up
  50      */
  51     ret = charset_from_unicode(NULL, 0, outbuf, size, charset, &state, NULL);
  52     size -= ret;
  53     outbuf += ret;
  54     *outbuf = '\0';
  55     return outbuf;
  56 }
  57
  58 char *ustrtoa(wchar_t const *s, char *outbuf, int size, int charset) {
  59     return ustrtoa_internal(s, outbuf, size, charset, FALSE);
  60 }
  61
  62 char *ustrtoa_careful(wchar_t const *s, char *outbuf, int size, int charset) {
  63     return ustrtoa_internal(s, outbuf, size, charset, TRUE);
  64 }
  65
  66 wchar_t *ustrfroma(char const *s, wchar_t *outbuf, int size, int charset) {
  67     int len, ret;
  68     charset_state state = CHARSET_INIT_STATE;
  69
  70     if (!s) {
  71         *outbuf = L'\0';
  72         return outbuf;
  73     }
  74
  75     len = strlen(s);
  76     size--;                            /* allow for terminating NUL */
  77     *outbuf = L'\0';
  78     while (len > 0) {
  79         ret = charset_to_unicode(&s, &len, outbuf, size,
  80                                  charset, &state, NULL, 0);
  81         if (!ret)
  82             return outbuf;
  83         outbuf += ret;
  84         size -= ret;
  85         *outbuf = L'\0';
  86     }
  87     return outbuf;
  88 }
  89
  90 char *utoa_internal_dup(wchar_t const *s, int charset, int *lenp, int careful)
  91 {
  92     char *outbuf;
  93     int outpos, outlen, len, ret, err;
  94     charset_state state = CHARSET_INIT_STATE;
  95
  96     if (!s) {
  97         return dupstr("");
  98     }
  99
 100     len = ustrlen(s);
 101
 102     outlen = len + 10;
 103     outbuf = mknewa(char, outlen);
 104
 105     outpos = 0;
 106     outbuf[outpos] = '\0';
 107
 108     while (len > 0) {
 109         err = 0;
 110         ret = charset_from_unicode(&s, &len,
 111                                    outbuf + outpos, outlen - outpos - 1,
 112                                    charset, &state, (careful ? &err : NULL));
 113         if (err) {
 114             sfree(outbuf);
 115             return NULL;
 116         }
 117         if (!ret) {
 118             outlen = outlen * 3 / 2;
 119             outbuf = resize(outbuf, outlen);
 120         }
 121         outpos += ret;
 122         outbuf[outpos] = '\0';
 123     }
 124     /*
 125      * Clean up
 126      */
 127     outlen = outpos + 32;
 128     outbuf = resize(outbuf, outlen);
 129     ret = charset_from_unicode(NULL, 0,
 130                                outbuf + outpos, outlen - outpos + 1,
 131                                charset, &state, NULL);
 132     outpos += ret;
 133     outbuf[outpos] = '\0';
 134     if (lenp)
 135         *lenp = outpos;
 136     return outbuf;
 137 }
 138
 139 char *utoa_dup(wchar_t const *s, int charset)
 140 {
 141     return utoa_internal_dup(s, charset, NULL, FALSE);
 142 }
 143
 144 char *utoa_dup_len(wchar_t const *s, int charset, int *len)
 145 {
 146     return utoa_internal_dup(s, charset, len, FALSE);
 147 }
 148
 149 char *utoa_careful_dup(wchar_t const *s, int charset)
 150 {
 151     return utoa_internal_dup(s, charset, NULL, TRUE);
 152 }
 153
 154 wchar_t *ufroma_dup(char const *s, int charset) {
 155     int len;
 156     wchar_t *buf = NULL;
 157
 158     len = strlen(s) + 1;
 159     do {
 160         buf = resize(buf, len);
 161         ustrfroma(s, buf, len, charset);
 162         len = (3 * len) / 2 + 1;       /* this guarantees a strict increase */
 163     } while (ustrlen(buf) >= len-1);
 164
 165     buf = resize(buf, ustrlen(buf)+1);
 166     return buf;
 167 }
 168
 169 char *utoa_locale_dup(wchar_t const *s)
 170 {
 171     /*
 172      * This variant uses the C library locale.
 173      */
 174     char *ret;
 175     int len;
 176     size_t siz;
 177
 178     len = ustrlen(s);
 179
 180     ret = mknewa(char, 1 + MB_CUR_MAX * len);
 181
 182     siz = wcstombs(ret, s, len);
 183
 184     if (siz) {
 185         assert(siz <= MB_CUR_MAX * len);
 186         ret[siz] = '\0';
 187         ret = resize(ret, siz+1);
 188         return ret;
 189     }
 190
 191     /*
 192      * If that failed, try a different strategy (which we will also
 193      * attempt in the total absence of wcstombs). Retrieve the
 194      * locale's charset from nl_langinfo or equivalent, and use
 195      * normal utoa_dup.
 196      */
 197     return utoa_dup(s, charset_from_locale());
 198 }
 199
 200 wchar_t *ufroma_locale_dup(char const *s)
 201 {
 202     /*
 203      * This variant uses the C library locale.
 204      */
 205     wchar_t *ret;
 206     int len;
 207     size_t siz;
 208
 209     len = strlen(s);
 210
 211     ret = mknewa(wchar_t, 1 + 2*len);  /* be conservative */
 212
 213     siz = mbstowcs(ret, s, len);
 214
 215     if (siz) {
 216         assert(siz <= (size_t)(2 * len));
 217         ret[siz] = L'\0';
 218         ret = resize(ret, siz+1);
 219         return ret;
 220     }
 221
 222     /*
 223      * If that failed, try a different strategy (which we will also
 224      * attempt in the total absence of wcstombs). Retrieve the
 225      * locale's charset from nl_langinfo or equivalent, and use
 226      * normal ufroma_dup.
 227      */
 228     return ufroma_dup(s, charset_from_locale());
 229 }
 230
 231 int ustrlen(wchar_t const *s) {
 232     int len = 0;
 233     while (*s++) len++;
 234     return len;
 235 }
 236
 237 wchar_t *uadv(wchar_t *s) {
 238     return s + 1 + ustrlen(s);
 239 }
 240
 241 wchar_t *ustrcpy(wchar_t *dest, wchar_t const *source) {
 242     wchar_t *ret = dest;
 243     do {
 244         *dest++ = *source;
 245     } while (*source++);
 246     return ret;
 247 }
 248
 249 wchar_t *ustrncpy(wchar_t *dest, wchar_t const *source, int n) {
 250     wchar_t *ret = dest;
 251     do {
 252         *dest++ = *source;
 253         if (*source) source++;
 254     } while (n-- > 0);
 255     return ret;
 256 }
 257
 258 int ustrcmp(wchar_t *lhs, wchar_t *rhs) {
 259     if (!lhs && !rhs) return 0;
 260     if (!lhs) return -1;
 261     if (!rhs) return +1;
 262     while (*lhs && *rhs && *lhs==*rhs)
 263         lhs++, rhs++;
 264     if (*lhs < *rhs)
 265         return -1;
 266     else if (*lhs > *rhs)
 267         return 1;
 268     return 0;
 269 }
 270
 271 wchar_t utolower(wchar_t c) {
 272     if (c == L'\0')
 273         return c;                      /* this property needed by ustricmp */
 274 #ifdef HAS_TOWLOWER
 275     return towlower(c);
 276 #else
 277     if (c >= 'A' && c <= 'Z')
 278         c += 'a'-'A';
 279     return c;
 280 #endif
 281 }
 282
 283 int uisalpha(wchar_t c) {
 284 #ifdef HAS_ISWALPHA
 285     return iswalpha(c);
 286 #else
 287     return (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z');
 288 #endif
 289 }
 290
 291 int ustricmp(wchar_t const *lhs, wchar_t const *rhs) {
 292     wchar_t lc, rc;
 293     while ((lc = utolower(*lhs)) == (rc = utolower(*rhs)) && lc && rc)
 294         lhs++, rhs++;
 295     if (!lc && !rc)
 296         return 0;
 297     if (lc < rc)
 298         return -1;
 299     else
 300         return 1;
 301 }
 302
 303 int ustrnicmp(wchar_t const *lhs, wchar_t const *rhs, int maxlen) {
 304     wchar_t lc = 0, rc = 0;
 305     while (maxlen-- > 0 &&
 306            (lc = utolower(*lhs)) == (rc = utolower(*rhs)) && lc && rc)
 307         lhs++, rhs++;
 308     if (lc < rc)
 309         return -1;
 310     else if (lc > rc)
 311         return 1;
 312     else
 313         return 0;
 314 }
 315
 316 wchar_t *ustrlow(wchar_t *s) {
 317     wchar_t *p = s;
 318     while (*p) {
 319         *p = utolower(*p);
 320         p++;
 321     }
 322     return s;
 323 }
 324
 325 int utoi(wchar_t const *s) {
 326     int sign = +1;
 327     int n;
 328
 329     if (*s == L'-') {
 330         s++;
 331         sign = -1;
 332     }
 333
 334     n = 0;
 335     while (*s && *s >= L'0' && *s <= L'9') {
 336         n *= 10;
 337         n += (*s - '0');
 338         s++;
 339     }
 340
 341     return n;
 342 }
 343
 344 double utof(wchar_t const *s)
 345 {
 346     char *cs = utoa_dup(s, CS_ASCII);
 347     double ret = atof(cs);
 348     sfree(cs);
 349     return ret;
 350 }
 351
 352 int utob(wchar_t const *s) {
 353     if (!ustricmp(s, L"yes") || !ustricmp(s, L"y") ||
 354         !ustricmp(s, L"true") || !ustricmp(s, L"t"))
 355         return TRUE;
 356     return FALSE;
 357 }
 358
 359 int uisdigit(wchar_t c) {
 360     return c >= L'0' && c <= L'9';
 361 }
 362
 363 #define USTRFTIME_DELTA 128
 364 static void ustrftime_internal(rdstring *rs, char formatchr,
 365                                const struct tm *timespec)
 366 {
 367     /*
 368      * strftime has the entertaining property that it returns 0
 369      * _either_ on out-of-space _or_ on successful generation of
 370      * the empty string. Hence we must ensure our format can never
 371      * generate the empty string. Somebody throw a custard pie at
 372      * whoever was responsible for that. Please?
 373      */
 374
 375 #ifdef HAS_WCSFTIME
 376     wchar_t *buf = NULL;
 377     wchar_t fmt[4];
 378     int size, ret;
 379
 380     fmt[0] = L' ';
 381     fmt[1] = L'%';
 382     /* Format chars are all ASCII, so conversion to Unicode is no problem */
 383     fmt[2] = formatchr;
 384     fmt[3] = L'\0';
 385
 386     size = 0;
 387     do {
 388         size += USTRFTIME_DELTA;
 389         buf = resize(buf, size);
 390         ret = (int) wcsftime(buf, size, fmt, timespec);
 391     } while (ret == 0);
 392
 393     rdadds(rs, buf+1);
 394     sfree(buf);
 395 #else
 396     char *buf = NULL;
 397     wchar_t *cvtbuf;
 398     char fmt[4];
 399     int size, ret;
 400
 401     fmt[0] = ' ';
 402     fmt[1] = '%';
 403     fmt[2] = formatchr;
 404     fmt[3] = '\0';
 405
 406     size = 0;
 407     do {
 408         size += USTRFTIME_DELTA;
 409         buf = resize(buf, size);
 410         ret = (int) strftime(buf, size, fmt, timespec);
 411     } while (ret == 0);
 412
 413     cvtbuf = ufroma_locale_dup(buf+1);
 414     rdadds(rs, cvtbuf);
 415     sfree(cvtbuf);
 416     sfree(buf);
 417 #endif
 418 }
 419
 420 wchar_t *ustrftime(const wchar_t *wfmt, const struct tm *timespec)
 421 {
 422     rdstring rs = { 0, 0, NULL };
 423
 424     if (!wfmt)
 425         wfmt = L"%c";
 426
 427     while (*wfmt) {
 428         if (wfmt[0] == L'%' && wfmt[1] == L'%') {
 429             rdadd(&rs, L'%');
 430             wfmt += 2;
 431         } else if (wfmt[0] == L'%' && wfmt[1]) {
 432             ustrftime_internal(&rs, wfmt[1], timespec);
 433             wfmt += 2;
 434         } else {
 435             rdadd(&rs, wfmt[0]);
 436             wfmt++;
 437         }
 438     }
 439
 440     return rdtrim(&rs);
 441 }
 442
 443 /*
 444  * Determine whether a Unicode string can be translated into a
 445  * given charset without any missing characters.
 446  */
 447 int cvt_ok(int charset, const wchar_t *s)
 448 {
 449     char buf[256];
 450     charset_state state = CHARSET_INIT_STATE;
 451     int err, len = ustrlen(s);
 452
 453     err = 0;
 454     while (len > 0) {
 455         (void)charset_from_unicode(&s, &len, buf, lenof(buf),
 456                                    charset, &state, &err);
 457         if (err)
 458             return FALSE;
 459     }
 460     return TRUE;
 461 }