mdw@git.distorted.org.uk Git - sgt/halibut/blob - ustring.c

   1 /*
   2  * ustring.c: Unicode string routines
   3  */
   4
   5 #include <wchar.h>
   6 #include <stdlib.h>
   7 #include <assert.h>
   8 #include <time.h>
   9 #include "halibut.h"
  10
  11 wchar_t *ustrdup(wchar_t const *s) {
  12     wchar_t *r;
  13     if (s) {
  14         r = mknewa(wchar_t, 1+ustrlen(s));
  15         ustrcpy(r, s);
  16     } else {
  17         r = mknew(wchar_t);
  18         *r = 0;
  19     }
  20     return r;
  21 }
  22
  23 static char *ustrtoa_internal(wchar_t const *s, char *outbuf, int size,
  24                               int charset, int careful) {
  25     int len, ret, err;
  26     charset_state state = CHARSET_INIT_STATE;
  27
  28     if (!s) {
  29         *outbuf = '\0';
  30         return outbuf;
  31     }
  32
  33     len = ustrlen(s);
  34     size--;                            /* leave room for terminating NUL */
  35     *outbuf = '\0';
  36     while (len > 0) {
  37         err = 0;
  38         ret = charset_from_unicode(&s, &len, outbuf, size, charset, &state,
  39                                    (careful ? &err : NULL));
  40         if (err)
  41             return NULL;
  42         if (!ret)
  43             return outbuf;
  44         size -= ret;
  45         outbuf += ret;
  46         *outbuf = '\0';
  47     }
  48     /*
  49      * Clean up
  50      */
  51     ret = charset_from_unicode(NULL, 0, outbuf, size, charset, &state, NULL);
  52     size -= ret;
  53     outbuf += ret;
  54     *outbuf = '\0';
  55     return outbuf;
  56 }
  57
  58 char *ustrtoa(wchar_t const *s, char *outbuf, int size, int charset) {
  59     return ustrtoa_internal(s, outbuf, size, charset, FALSE);
  60 }
  61
  62 char *ustrtoa_careful(wchar_t const *s, char *outbuf, int size, int charset) {
  63     return ustrtoa_internal(s, outbuf, size, charset, TRUE);
  64 }
  65
  66 wchar_t *ustrfroma(char const *s, wchar_t *outbuf, int size, int charset) {
  67     int len, ret;
  68     charset_state state = CHARSET_INIT_STATE;
  69
  70     if (!s) {
  71         *outbuf = L'\0';
  72         return outbuf;
  73     }
  74
  75     len = strlen(s);
  76     size--;                            /* allow for terminating NUL */
  77     *outbuf = L'\0';
  78     while (len > 0) {
  79         ret = charset_to_unicode(&s, &len, outbuf, size,
  80                                  charset, &state, NULL, 0);
  81         if (!ret)
  82             return outbuf;
  83         outbuf += ret;
  84         size -= ret;
  85         *outbuf = L'\0';
  86     }
  87     return outbuf;
  88 }
  89
  90 char *utoa_internal_dup(wchar_t const *s, int charset, int *lenp, int careful)
  91 {
  92     char *outbuf;
  93     int outpos, outlen, len, ret, err;
  94     charset_state state = CHARSET_INIT_STATE;
  95
  96     if (!s) {
  97         return dupstr("");
  98     }
  99
 100     len = ustrlen(s);
 101
 102     outlen = len + 10;
 103     outbuf = mknewa(char, outlen);
 104
 105     outpos = 0;
 106     outbuf[outpos] = '\0';
 107
 108     while (len > 0) {
 109         err = 0;
 110         ret = charset_from_unicode(&s, &len,
 111                                    outbuf + outpos, outlen - outpos - 1,
 112                                    charset, &state, (careful ? &err : NULL));
 113         if (err) {
 114             sfree(outbuf);
 115             return NULL;
 116         }
 117         if (!ret) {
 118             outlen = outlen * 3 / 2;
 119             outbuf = resize(outbuf, outlen);
 120         }
 121         outpos += ret;
 122         outbuf[outpos] = '\0';
 123     }
 124     /*
 125      * Clean up
 126      */
 127     outlen = outpos + 32;
 128     outbuf = resize(outbuf, outlen);
 129     ret = charset_from_unicode(NULL, 0,
 130                                outbuf + outpos, outlen - outpos + 1,
 131                                charset, &state, NULL);
 132     outpos += ret;
 133     outbuf[outpos] = '\0';
 134     if (lenp)
 135         *lenp = outpos;
 136     return outbuf;
 137 }
 138
 139 char *utoa_dup(wchar_t const *s, int charset)
 140 {
 141     return utoa_internal_dup(s, charset, NULL, FALSE);
 142 }
 143
 144 char *utoa_dup_len(wchar_t const *s, int charset, int *len)
 145 {
 146     return utoa_internal_dup(s, charset, len, FALSE);
 147 }
 148
 149 char *utoa_careful_dup(wchar_t const *s, int charset)
 150 {
 151     return utoa_internal_dup(s, charset, NULL, TRUE);
 152 }
 153
 154 wchar_t *ufroma_dup(char const *s, int charset) {
 155     int len;
 156     wchar_t *buf = NULL;
 157
 158     len = strlen(s) + 1;
 159     do {
 160         buf = resize(buf, len);
 161         ustrfroma(s, buf, len, charset);
 162         len = (3 * len) / 2 + 1;       /* this guarantees a strict increase */
 163     } while (ustrlen(buf) >= len-1);
 164
 165     buf = resize(buf, ustrlen(buf)+1);
 166     return buf;
 167 }
 168
 169 char *utoa_locale_dup(wchar_t const *s)
 170 {
 171     /*
 172      * This variant uses the C library locale.
 173      */
 174     char *ret;
 175     int len;
 176     size_t siz;
 177
 178     len = ustrlen(s);
 179
 180     ret = mknewa(char, 1 + MB_CUR_MAX * len);
 181
 182     siz = wcstombs(ret, s, len);
 183
 184     if (siz) {
 185         assert(siz <= MB_CUR_MAX * len);
 186         ret[siz] = '\0';
 187         ret = resize(ret, siz+1);
 188         return ret;
 189     }
 190
 191     /*
 192      * If that failed, try a different strategy (which we will also
 193      * attempt in the total absence of wcstombs). Retrieve the
 194      * locale's charset from nl_langinfo or equivalent, and use
 195      * normal utoa_dup.
 196      */
 197     return utoa_dup(s, charset_from_locale());
 198 }
 199
 200 wchar_t *ufroma_locale_dup(char const *s)
 201 {
 202     /*
 203      * This variant uses the C library locale.
 204      */
 205     wchar_t *ret;
 206     int len;
 207     size_t siz;
 208
 209     len = strlen(s);
 210
 211     ret = mknewa(wchar_t, 1 + 2*len);  /* be conservative */
 212
 213     siz = mbstowcs(ret, s, len);
 214
 215     if (siz) {
 216         assert(siz <= (size_t)(2 * len));
 217         ret[siz] = L'\0';
 218         ret = resize(ret, siz+1);
 219         return ret;
 220     }
 221
 222     /*
 223      * If that failed, try a different strategy (which we will also
 224      * attempt in the total absence of wcstombs). Retrieve the
 225      * locale's charset from nl_langinfo or equivalent, and use
 226      * normal ufroma_dup.
 227      */
 228     return ufroma_dup(s, charset_from_locale());
 229 }
 230
 231 int ustrlen(wchar_t const *s) {
 232     int len = 0;
 233     while (*s++) len++;
 234     return len;
 235 }
 236
 237 wchar_t *uadv(wchar_t *s) {
 238     return s + 1 + ustrlen(s);
 239 }
 240
 241 wchar_t *ustrcpy(wchar_t *dest, wchar_t const *source) {
 242     wchar_t *ret = dest;
 243     do {
 244         *dest++ = *source;
 245     } while (*source++);
 246     return ret;
 247 }
 248
 249 int ustrcmp(wchar_t *lhs, wchar_t *rhs) {
 250     if (!lhs && !rhs) return 0;
 251     if (!lhs) return -1;
 252     if (!rhs) return +1;
 253     while (*lhs && *rhs && *lhs==*rhs)
 254         lhs++, rhs++;
 255     if (*lhs < *rhs)
 256         return -1;
 257     else if (*lhs > *rhs)
 258         return 1;
 259     return 0;
 260 }
 261
 262 wchar_t utolower(wchar_t c) {
 263     if (c == L'\0')
 264         return c;                      /* this property needed by ustricmp */
 265 #ifdef HAS_TOWLOWER
 266     return towlower(c);
 267 #else
 268     if (c >= 'A' && c <= 'Z')
 269         c += 'a'-'A';
 270     return c;
 271 #endif
 272 }
 273
 274 int uisalpha(wchar_t c) {
 275 #ifdef HAS_ISWALPHA
 276     return iswalpha(c);
 277 #else
 278     return (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z');
 279 #endif
 280 }
 281
 282 int ustricmp(wchar_t *lhs, wchar_t *rhs) {
 283     wchar_t lc, rc;
 284     while ((lc = utolower(*lhs)) == (rc = utolower(*rhs)) && lc && rc)
 285         lhs++, rhs++;
 286     if (!lc && !rc)
 287         return 0;
 288     if (lc < rc)
 289         return -1;
 290     else
 291         return 1;
 292 }
 293
 294 wchar_t *ustrlow(wchar_t *s) {
 295     wchar_t *p = s;
 296     while (*p) {
 297         *p = utolower(*p);
 298         p++;
 299     }
 300     return s;
 301 }
 302
 303 int utoi(wchar_t *s) {
 304     int sign = +1;
 305     int n;
 306
 307     if (*s == L'-') {
 308         s++;
 309         sign = -1;
 310     }
 311
 312     n = 0;
 313     while (*s && *s >= L'0' && *s <= L'9') {
 314         n *= 10;
 315         n += (*s - '0');
 316         s++;
 317     }
 318
 319     return n;
 320 }
 321
 322 int utob(wchar_t *s) {
 323     if (!ustricmp(s, L"yes") || !ustricmp(s, L"y") ||
 324         !ustricmp(s, L"true") || !ustricmp(s, L"t"))
 325         return TRUE;
 326     return FALSE;
 327 }
 328
 329 int uisdigit(wchar_t c) {
 330     return c >= L'0' && c <= L'9';
 331 }
 332
 333 #define USTRFTIME_DELTA 128
 334 static void ustrftime_internal(rdstring *rs, char formatchr,
 335                                const struct tm *timespec)
 336 {
 337     /*
 338      * strftime has the entertaining property that it returns 0
 339      * _either_ on out-of-space _or_ on successful generation of
 340      * the empty string. Hence we must ensure our format can never
 341      * generate the empty string. Somebody throw a custard pie at
 342      * whoever was responsible for that. Please?
 343      */
 344
 345 #ifdef HAS_WCSFTIME
 346     wchar_t *buf = NULL;
 347     wchar_t fmt[4];
 348     int size, ret;
 349
 350     fmt[0] = L' ';
 351     fmt[1] = L'%';
 352     /* Format chars are all ASCII, so conversion to Unicode is no problem */
 353     fmt[2] = formatchr;
 354     fmt[3] = L'\0';
 355
 356     size = 0;
 357     do {
 358         size += USTRFTIME_DELTA;
 359         buf = resize(buf, size);
 360         ret = (int) wcsftime(buf, size, fmt, timespec);
 361     } while (ret == 0);
 362
 363     rdadds(rs, buf+1);
 364     sfree(buf);
 365 #else
 366     char *buf = NULL;
 367     wchar_t *cvtbuf;
 368     char fmt[4];
 369     int size, ret;
 370
 371     fmt[0] = ' ';
 372     fmt[1] = '%';
 373     fmt[2] = formatchr;
 374     fmt[3] = '\0';
 375
 376     size = 0;
 377     do {
 378         size += USTRFTIME_DELTA;
 379         buf = resize(buf, size);
 380         ret = (int) strftime(buf, size, fmt, timespec);
 381     } while (ret == 0);
 382
 383     cvtbuf = ufroma_locale_dup(buf+1);
 384     rdadds(rs, cvtbuf);
 385     sfree(cvtbuf);
 386     sfree(buf);
 387 #endif
 388 }
 389
 390 wchar_t *ustrftime(const wchar_t *wfmt, const struct tm *timespec)
 391 {
 392     rdstring rs = { 0, 0, NULL };
 393
 394     if (!wfmt)
 395         wfmt = L"%c";
 396
 397     while (*wfmt) {
 398         if (wfmt[0] == L'%' && wfmt[1] == L'%') {
 399             rdadd(&rs, L'%');
 400             wfmt += 2;
 401         } else if (wfmt[0] == L'%' && wfmt[1]) {
 402             ustrftime_internal(&rs, wfmt[1], timespec);
 403             wfmt += 2;
 404         } else {
 405             rdadd(&rs, wfmt[0]);
 406             wfmt++;
 407         }
 408     }
 409
 410     return rdtrim(&rs);
 411 }
 412
 413 /*
 414  * Determine whether a Unicode string can be translated into a
 415  * given charset without any missing characters.
 416  */
 417 int cvt_ok(int charset, const wchar_t *s)
 418 {
 419     char buf[256];
 420     charset_state state = CHARSET_INIT_STATE;
 421     int err, len = ustrlen(s);
 422
 423     err = 0;
 424     while (len > 0) {
 425         (void)charset_from_unicode(&s, &len, buf, lenof(buf),
 426                                    charset, &state, &err);
 427         if (err)
 428             return FALSE;
 429     }
 430     return TRUE;
 431 }