mdw@git.distorted.org.uk Git - sgt/halibut/blob - ustring.c

   1 /*
   2  * ustring.c: Unicode string routines
   3  */
   4
   5 #include <wchar.h>
   6 #include <stdlib.h>
   7 #include <assert.h>
   8 #include <time.h>
   9 #include "halibut.h"
  10
  11 wchar_t *ustrdup(wchar_t const *s) {
  12     wchar_t *r;
  13     if (s) {
  14         r = snewn(1+ustrlen(s), wchar_t);
  15         ustrcpy(r, s);
  16     } else {
  17         r = snew(wchar_t);
  18         *r = 0;
  19     }
  20     return r;
  21 }
  22
  23 static char *ustrtoa_internal(wchar_t const *s, char *outbuf, int size,
  24                               int charset, int careful) {
  25     int len, ret, err;
  26     charset_state state = CHARSET_INIT_STATE;
  27
  28     if (!s) {
  29         *outbuf = '\0';
  30         return outbuf;
  31     }
  32
  33     len = ustrlen(s);
  34     size--;                            /* leave room for terminating NUL */
  35     *outbuf = '\0';
  36     while (len > 0) {
  37         err = 0;
  38         ret = charset_from_unicode(&s, &len, outbuf, size, charset, &state,
  39                                    (careful ? &err : NULL));
  40         if (err)
  41             return NULL;
  42         if (!ret)
  43             return outbuf;
  44         size -= ret;
  45         outbuf += ret;
  46         *outbuf = '\0';
  47     }
  48     /*
  49      * Clean up
  50      */
  51     ret = charset_from_unicode(NULL, 0, outbuf, size, charset, &state, NULL);
  52     size -= ret;
  53     outbuf += ret;
  54     *outbuf = '\0';
  55     return outbuf;
  56 }
  57
  58 char *ustrtoa(wchar_t const *s, char *outbuf, int size, int charset) {
  59     return ustrtoa_internal(s, outbuf, size, charset, FALSE);
  60 }
  61
  62 char *ustrtoa_careful(wchar_t const *s, char *outbuf, int size, int charset) {
  63     return ustrtoa_internal(s, outbuf, size, charset, TRUE);
  64 }
  65
  66 wchar_t *ustrfroma(char const *s, wchar_t *outbuf, int size, int charset) {
  67     int len, ret;
  68     charset_state state = CHARSET_INIT_STATE;
  69
  70     if (!s) {
  71         *outbuf = L'\0';
  72         return outbuf;
  73     }
  74
  75     len = strlen(s);
  76     size--;                            /* allow for terminating NUL */
  77     *outbuf = L'\0';
  78     while (len > 0) {
  79         ret = charset_to_unicode(&s, &len, outbuf, size,
  80                                  charset, &state, NULL, 0);
  81         if (!ret)
  82             return outbuf;
  83         outbuf += ret;
  84         size -= ret;
  85         *outbuf = L'\0';
  86     }
  87     return outbuf;
  88 }
  89
  90 char *utoa_internal_dup(wchar_t const *s, int charset, int *lenp, int careful)
  91 {
  92     char *outbuf;
  93     int outpos, outlen, len, ret, err;
  94     charset_state state = CHARSET_INIT_STATE;
  95
  96     if (!s) {
  97         return dupstr("");
  98     }
  99
 100     len = ustrlen(s);
 101
 102     outlen = len + 10;
 103     outbuf = snewn(outlen, char);
 104
 105     outpos = 0;
 106     outbuf[outpos] = '\0';
 107
 108     while (len > 0) {
 109         err = 0;
 110         ret = charset_from_unicode(&s, &len,
 111                                    outbuf + outpos, outlen - outpos - 1,
 112                                    charset, &state, (careful ? &err : NULL));
 113         if (err) {
 114             sfree(outbuf);
 115             return NULL;
 116         }
 117         if (!ret) {
 118             outlen = outlen * 3 / 2;
 119             outbuf = sresize(outbuf, outlen, char);
 120         }
 121         outpos += ret;
 122         outbuf[outpos] = '\0';
 123     }
 124     /*
 125      * Clean up
 126      */
 127     outlen = outpos + 32;
 128     outbuf = sresize(outbuf, outlen, char);
 129     ret = charset_from_unicode(NULL, 0,
 130                                outbuf + outpos, outlen - outpos + 1,
 131                                charset, &state, NULL);
 132     outpos += ret;
 133     outbuf[outpos] = '\0';
 134     if (lenp)
 135         *lenp = outpos;
 136     return outbuf;
 137 }
 138
 139 char *utoa_dup(wchar_t const *s, int charset)
 140 {
 141     return utoa_internal_dup(s, charset, NULL, FALSE);
 142 }
 143
 144 char *utoa_dup_len(wchar_t const *s, int charset, int *len)
 145 {
 146     return utoa_internal_dup(s, charset, len, FALSE);
 147 }
 148
 149 char *utoa_careful_dup(wchar_t const *s, int charset)
 150 {
 151     return utoa_internal_dup(s, charset, NULL, TRUE);
 152 }
 153
 154 wchar_t *ufroma_dup(char const *s, int charset) {
 155     int len;
 156     wchar_t *buf = NULL;
 157
 158     len = strlen(s) + 1;
 159     do {
 160         buf = sresize(buf, len, wchar_t);
 161         ustrfroma(s, buf, len, charset);
 162         len = (3 * len) / 2 + 1;       /* this guarantees a strict increase */
 163     } while (ustrlen(buf) >= len-1);
 164
 165     buf = sresize(buf, ustrlen(buf)+1, wchar_t);
 166     return buf;
 167 }
 168
 169 char *utoa_locale_dup(wchar_t const *s)
 170 {
 171     /*
 172      * This variant uses the C library locale.
 173      */
 174     char *ret;
 175     int len, outlen;
 176     size_t siz;
 177
 178     len = ustrlen(s);
 179
 180     outlen = 1 + MB_CUR_MAX * len;
 181     ret = snewn(outlen+1, char);
 182
 183     siz = wcstombs(ret, s, outlen);
 184
 185     if (siz) {
 186         assert(siz <= (size_t)(outlen));
 187         ret[siz] = '\0';
 188         ret = sresize(ret, siz+1, char);
 189         return ret;
 190     }
 191
 192     /*
 193      * If that failed, try a different strategy (which we will also
 194      * attempt in the total absence of wcstombs). Retrieve the
 195      * locale's charset from nl_langinfo or equivalent, and use
 196      * normal utoa_dup.
 197      */
 198     return utoa_dup(s, charset_from_locale());
 199 }
 200
 201 wchar_t *ufroma_locale_dup(char const *s)
 202 {
 203     /*
 204      * This variant uses the C library locale.
 205      */
 206     wchar_t *ret;
 207     int len, outlen;
 208     size_t siz;
 209
 210     len = strlen(s);
 211
 212     outlen = 1 + 2*len;
 213     ret = snewn(outlen+1, wchar_t);  /* be conservative */
 214
 215     siz = mbstowcs(ret, s, outlen);
 216
 217     if (siz) {
 218         assert(siz <= (size_t)(outlen));
 219         ret[siz] = L'\0';
 220         ret = sresize(ret, siz+1, wchar_t);
 221         return ret;
 222     }
 223
 224     /*
 225      * If that failed, try a different strategy (which we will also
 226      * attempt in the total absence of wcstombs). Retrieve the
 227      * locale's charset from nl_langinfo or equivalent, and use
 228      * normal ufroma_dup.
 229      */
 230     return ufroma_dup(s, charset_from_locale());
 231 }
 232
 233 int ustrlen(wchar_t const *s) {
 234     int len = 0;
 235     while (*s++) len++;
 236     return len;
 237 }
 238
 239 wchar_t *uadv(wchar_t *s) {
 240     return s + 1 + ustrlen(s);
 241 }
 242
 243 wchar_t *ustrcpy(wchar_t *dest, wchar_t const *source) {
 244     wchar_t *ret = dest;
 245     do {
 246         *dest++ = *source;
 247     } while (*source++);
 248     return ret;
 249 }
 250
 251 wchar_t *ustrncpy(wchar_t *dest, wchar_t const *source, int n) {
 252     wchar_t *ret = dest;
 253     do {
 254         *dest++ = *source;
 255         if (*source) source++;
 256     } while (n-- > 0);
 257     return ret;
 258 }
 259
 260 int ustrcmp(wchar_t *lhs, wchar_t *rhs) {
 261     if (!lhs && !rhs) return 0;
 262     if (!lhs) return -1;
 263     if (!rhs) return +1;
 264     while (*lhs && *rhs && *lhs==*rhs)
 265         lhs++, rhs++;
 266     if (*lhs < *rhs)
 267         return -1;
 268     else if (*lhs > *rhs)
 269         return 1;
 270     return 0;
 271 }
 272
 273 wchar_t utolower(wchar_t c) {
 274     if (c == L'\0')
 275         return c;                      /* this property needed by ustricmp */
 276 #ifdef HAS_TOWLOWER
 277     return towlower(c);
 278 #else
 279     if (c >= 'A' && c <= 'Z')
 280         c += 'a'-'A';
 281     return c;
 282 #endif
 283 }
 284
 285 int uisalpha(wchar_t c) {
 286 #ifdef HAS_ISWALPHA
 287     return iswalpha(c);
 288 #else
 289     return (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z');
 290 #endif
 291 }
 292
 293 int ustricmp(wchar_t const *lhs, wchar_t const *rhs) {
 294     wchar_t lc, rc;
 295     while ((lc = utolower(*lhs)) == (rc = utolower(*rhs)) && lc && rc)
 296         lhs++, rhs++;
 297     if (!lc && !rc)
 298         return 0;
 299     if (lc < rc)
 300         return -1;
 301     else
 302         return 1;
 303 }
 304
 305 int ustrnicmp(wchar_t const *lhs, wchar_t const *rhs, int maxlen) {
 306     wchar_t lc = 0, rc = 0;
 307     while (maxlen-- > 0 &&
 308            (lc = utolower(*lhs)) == (rc = utolower(*rhs)) && lc && rc)
 309         lhs++, rhs++;
 310     if (lc < rc)
 311         return -1;
 312     else if (lc > rc)
 313         return 1;
 314     else
 315         return 0;
 316 }
 317
 318 wchar_t *ustrlow(wchar_t *s) {
 319     wchar_t *p = s;
 320     while (*p) {
 321         *p = utolower(*p);
 322         p++;
 323     }
 324     return s;
 325 }
 326
 327 int utoi(wchar_t const *s) {
 328     int sign = +1;
 329     int n;
 330
 331     if (*s == L'-') {
 332         s++;
 333         sign = -1;
 334     }
 335
 336     n = 0;
 337     while (*s && *s >= L'0' && *s <= L'9') {
 338         n *= 10;
 339         n += (*s - '0');
 340         s++;
 341     }
 342
 343     return n * sign;
 344 }
 345
 346 double utof(wchar_t const *s)
 347 {
 348     char *cs = utoa_dup(s, CS_ASCII);
 349     double ret = atof(cs);
 350     sfree(cs);
 351     return ret;
 352 }
 353
 354 int utob(wchar_t const *s) {
 355     if (!ustricmp(s, L"yes") || !ustricmp(s, L"y") ||
 356         !ustricmp(s, L"true") || !ustricmp(s, L"t"))
 357         return TRUE;
 358     return FALSE;
 359 }
 360
 361 int uisdigit(wchar_t c) {
 362     return c >= L'0' && c <= L'9';
 363 }
 364
 365 #define USTRFTIME_DELTA 128
 366 static void ustrftime_internal(rdstring *rs, char formatchr,
 367                                const struct tm *timespec)
 368 {
 369     /*
 370      * strftime has the entertaining property that it returns 0
 371      * _either_ on out-of-space _or_ on successful generation of
 372      * the empty string. Hence we must ensure our format can never
 373      * generate the empty string. Somebody throw a custard pie at
 374      * whoever was responsible for that. Please?
 375      */
 376
 377 #ifdef HAS_WCSFTIME
 378     wchar_t *buf = NULL;
 379     wchar_t fmt[4];
 380     int size, ret;
 381
 382     fmt[0] = L' ';
 383     fmt[1] = L'%';
 384     /* Format chars are all ASCII, so conversion to Unicode is no problem */
 385     fmt[2] = formatchr;
 386     fmt[3] = L'\0';
 387
 388     size = 0;
 389     do {
 390         size += USTRFTIME_DELTA;
 391         buf = sresize(buf, size, wchar_t);
 392         ret = (int) wcsftime(buf, size, fmt, timespec);
 393     } while (ret == 0);
 394
 395     rdadds(rs, buf+1);
 396     sfree(buf);
 397 #else
 398     char *buf = NULL;
 399     wchar_t *cvtbuf;
 400     char fmt[4];
 401     int size, ret;
 402
 403     fmt[0] = ' ';
 404     fmt[1] = '%';
 405     fmt[2] = formatchr;
 406     fmt[3] = '\0';
 407
 408     size = 0;
 409     do {
 410         size += USTRFTIME_DELTA;
 411         buf = sresize(buf, size, char);
 412         ret = (int) strftime(buf, size, fmt, timespec);
 413     } while (ret == 0);
 414
 415     cvtbuf = ufroma_locale_dup(buf+1);
 416     rdadds(rs, cvtbuf);
 417     sfree(cvtbuf);
 418     sfree(buf);
 419 #endif
 420 }
 421
 422 wchar_t *ustrftime(const wchar_t *wfmt, const struct tm *timespec)
 423 {
 424     rdstring rs = { 0, 0, NULL };
 425
 426     if (!wfmt)
 427         wfmt = L"%c";
 428
 429     while (*wfmt) {
 430         if (wfmt[0] == L'%' && wfmt[1] == L'%') {
 431             rdadd(&rs, L'%');
 432             wfmt += 2;
 433         } else if (wfmt[0] == L'%' && wfmt[1]) {
 434             ustrftime_internal(&rs, wfmt[1], timespec);
 435             wfmt += 2;
 436         } else {
 437             rdadd(&rs, wfmt[0]);
 438             wfmt++;
 439         }
 440     }
 441
 442     return rdtrim(&rs);
 443 }
 444
 445 /*
 446  * Determine whether a Unicode string can be translated into a
 447  * given charset without any missing characters.
 448  */
 449 int cvt_ok(int charset, const wchar_t *s)
 450 {
 451     char buf[256];
 452     charset_state state = CHARSET_INIT_STATE;
 453     int err, len = ustrlen(s);
 454
 455     err = 0;
 456     while (len > 0) {
 457         (void)charset_from_unicode(&s, &len, buf, lenof(buf),
 458                                    charset, &state, &err);
 459         if (err)
 460             return FALSE;
 461     }
 462     return TRUE;
 463 }
 464
 465 /*
 466  * Wrapper around charset_from_localenc which accepts the charset
 467  * name as a wide string (since that happens to be more useful).
 468  * Also throws a Halibut error and falls back to CS_ASCII if the
 469  * charset is unrecognised, meaning the rest of the program can
 470  * rely on always getting a valid charset id back from this
 471  * function.
 472  */
 473 int charset_from_ustr(filepos *fpos, const wchar_t *name)
 474 {
 475     char *csname;
 476     int charset;
 477
 478     csname = utoa_dup(name, CS_ASCII);
 479     charset = charset_from_localenc(csname);
 480
 481     if (charset == CS_NONE) {
 482         charset = CS_ASCII;
 483         error(err_charset, fpos, name);
 484     }
 485
 486     sfree(csname);
 487     return charset;
 488 }