mdw@git.distorted.org.uk Git - sgt/halibut/blob - ustring.c

   1 /*
   2  * ustring.c: Unicode string routines
   3  */
   4
   5 #include <wchar.h>
   6 #include <stdlib.h>
   7 #include <assert.h>
   8 #include <time.h>
   9 #include "halibut.h"
  10
  11 wchar_t *ustrdup(wchar_t const *s) {
  12     wchar_t *r;
  13     if (s) {
  14         r = mknewa(wchar_t, 1+ustrlen(s));
  15         ustrcpy(r, s);
  16     } else {
  17         r = mknew(wchar_t);
  18         *r = 0;
  19     }
  20     return r;
  21 }
  22
  23 static char *ustrtoa_internal(wchar_t const *s, char *outbuf, int size,
  24                               int charset, int careful) {
  25     int len, ret, err;
  26     charset_state state = CHARSET_INIT_STATE;
  27
  28     if (!s) {
  29         *outbuf = '\0';
  30         return outbuf;
  31     }
  32
  33     len = ustrlen(s);
  34     size--;                            /* leave room for terminating NUL */
  35     *outbuf = '\0';
  36     while (len > 0) {
  37         err = 0;
  38         ret = charset_from_unicode(&s, &len, outbuf, size, charset, &state,
  39                                    (careful ? &err : NULL));
  40         if (err)
  41             return NULL;
  42         if (!ret)
  43             return outbuf;
  44         size -= ret;
  45         outbuf += ret;
  46         *outbuf = '\0';
  47     }
  48     /*
  49      * Clean up
  50      */
  51     ret = charset_from_unicode(NULL, 0, outbuf, size, charset, &state, NULL);
  52     size -= ret;
  53     outbuf += ret;
  54     *outbuf = '\0';
  55     return outbuf;
  56 }
  57
  58 char *ustrtoa(wchar_t const *s, char *outbuf, int size, int charset) {
  59     return ustrtoa_internal(s, outbuf, size, charset, FALSE);
  60 }
  61
  62 char *ustrtoa_careful(wchar_t const *s, char *outbuf, int size, int charset) {
  63     return ustrtoa_internal(s, outbuf, size, charset, TRUE);
  64 }
  65
  66 wchar_t *ustrfroma(char const *s, wchar_t *outbuf, int size, int charset) {
  67     int len, ret;
  68     charset_state state = CHARSET_INIT_STATE;
  69
  70     if (!s) {
  71         *outbuf = L'\0';
  72         return outbuf;
  73     }
  74
  75     len = strlen(s);
  76     size--;                            /* allow for terminating NUL */
  77     *outbuf = L'\0';
  78     while (len > 0) {
  79         ret = charset_to_unicode(&s, &len, outbuf, size,
  80                                  charset, &state, NULL, 0);
  81         if (!ret)
  82             return outbuf;
  83         outbuf += ret;
  84         size -= ret;
  85         *outbuf = L'\0';
  86     }
  87     return outbuf;
  88 }
  89
  90 char *utoa_internal_dup(wchar_t const *s, int charset, int *lenp, int careful)
  91 {
  92     char *outbuf;
  93     int outpos, outlen, len, ret, err;
  94     charset_state state = CHARSET_INIT_STATE;
  95
  96     if (!s) {
  97         return dupstr("");
  98     }
  99
 100     len = ustrlen(s);
 101
 102     outlen = len + 10;
 103     outbuf = mknewa(char, outlen);
 104
 105     outpos = 0;
 106     outbuf[outpos] = '\0';
 107
 108     while (len > 0) {
 109         err = 0;
 110         ret = charset_from_unicode(&s, &len,
 111                                    outbuf + outpos, outlen - outpos - 1,
 112                                    charset, &state, (careful ? &err : NULL));
 113         if (err) {
 114             sfree(outbuf);
 115             return NULL;
 116         }
 117         if (!ret) {
 118             outlen = outlen * 3 / 2;
 119             outbuf = resize(outbuf, outlen);
 120         }
 121         outpos += ret;
 122         outbuf[outpos] = '\0';
 123     }
 124     /*
 125      * Clean up
 126      */
 127     outlen = outpos + 32;
 128     outbuf = resize(outbuf, outlen);
 129     ret = charset_from_unicode(NULL, 0,
 130                                outbuf + outpos, outlen - outpos + 1,
 131                                charset, &state, NULL);
 132     outpos += ret;
 133     outbuf[outpos] = '\0';
 134     if (lenp)
 135         *lenp = outpos;
 136     return outbuf;
 137 }
 138
 139 char *utoa_dup(wchar_t const *s, int charset)
 140 {
 141     return utoa_internal_dup(s, charset, NULL, FALSE);
 142 }
 143
 144 char *utoa_dup_len(wchar_t const *s, int charset, int *len)
 145 {
 146     return utoa_internal_dup(s, charset, len, FALSE);
 147 }
 148
 149 char *utoa_careful_dup(wchar_t const *s, int charset)
 150 {
 151     return utoa_internal_dup(s, charset, NULL, TRUE);
 152 }
 153
 154 wchar_t *ufroma_dup(char const *s, int charset) {
 155     int len;
 156     wchar_t *buf = NULL;
 157
 158     len = strlen(s) + 1;
 159     do {
 160         buf = resize(buf, len);
 161         ustrfroma(s, buf, len, charset);
 162         len = (3 * len) / 2 + 1;       /* this guarantees a strict increase */
 163     } while (ustrlen(buf) >= len-1);
 164
 165     buf = resize(buf, ustrlen(buf)+1);
 166     return buf;
 167 }
 168
 169 char *utoa_locale_dup(wchar_t const *s)
 170 {
 171     /*
 172      * This variant uses the C library locale.
 173      */
 174     char *ret;
 175     int len;
 176     size_t siz;
 177
 178     len = ustrlen(s);
 179
 180     ret = mknewa(char, 1 + MB_CUR_MAX * len);
 181
 182     siz = wcstombs(ret, s, len);
 183
 184     if (siz) {
 185         assert(siz <= MB_CUR_MAX * len);
 186         ret[siz] = '\0';
 187         ret = resize(ret, siz+1);
 188         return ret;
 189     }
 190
 191     /*
 192      * If that failed, try a different strategy (which we will also
 193      * attempt in the total absence of wcstombs). Retrieve the
 194      * locale's charset from nl_langinfo or equivalent, and use
 195      * normal utoa_dup.
 196      */
 197     return utoa_dup(s, charset_from_locale());
 198 }
 199
 200 wchar_t *ufroma_locale_dup(char const *s)
 201 {
 202     /*
 203      * This variant uses the C library locale.
 204      */
 205     wchar_t *ret;
 206     int len;
 207     size_t siz;
 208
 209     len = strlen(s);
 210
 211     ret = mknewa(wchar_t, 1 + 2*len);  /* be conservative */
 212
 213     siz = mbstowcs(ret, s, len);
 214
 215     if (siz) {
 216         assert(siz <= (size_t)(2 * len));
 217         ret[siz] = L'\0';
 218         ret = resize(ret, siz+1);
 219         return ret;
 220     }
 221
 222     /*
 223      * If that failed, try a different strategy (which we will also
 224      * attempt in the total absence of wcstombs). Retrieve the
 225      * locale's charset from nl_langinfo or equivalent, and use
 226      * normal ufroma_dup.
 227      */
 228     return ufroma_dup(s, charset_from_locale());
 229 }
 230
 231 int ustrlen(wchar_t const *s) {
 232     int len = 0;
 233     while (*s++) len++;
 234     return len;
 235 }
 236
 237 wchar_t *uadv(wchar_t *s) {
 238     return s + 1 + ustrlen(s);
 239 }
 240
 241 wchar_t *ustrcpy(wchar_t *dest, wchar_t const *source) {
 242     wchar_t *ret = dest;
 243     do {
 244         *dest++ = *source;
 245     } while (*source++);
 246     return ret;
 247 }
 248
 249 int ustrcmp(wchar_t *lhs, wchar_t *rhs) {
 250     if (!lhs && !rhs) return 0;
 251     if (!lhs) return -1;
 252     if (!rhs) return +1;
 253     while (*lhs && *rhs && *lhs==*rhs)
 254         lhs++, rhs++;
 255     if (*lhs < *rhs)
 256         return -1;
 257     else if (*lhs > *rhs)
 258         return 1;
 259     return 0;
 260 }
 261
 262 wchar_t utolower(wchar_t c) {
 263     if (c == L'\0')
 264         return c;                      /* this property needed by ustricmp */
 265 #ifdef HAS_TOWLOWER
 266     return towlower(c);
 267 #else
 268     if (c >= 'A' && c <= 'Z')
 269         c += 'a'-'A';
 270     return c;
 271 #endif
 272 }
 273
 274 int uisalpha(wchar_t c) {
 275 #ifdef HAS_ISWALPHA
 276     return iswalpha(c);
 277 #else
 278     return (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z');
 279 #endif
 280 }
 281
 282 int ustricmp(wchar_t *lhs, wchar_t *rhs) {
 283     wchar_t lc, rc;
 284     while ((lc = utolower(*lhs)) == (rc = utolower(*rhs)) && lc && rc)
 285         lhs++, rhs++;
 286     if (!lc && !rc)
 287         return 0;
 288     if (lc < rc)
 289         return -1;
 290     else
 291         return 1;
 292 }
 293
 294 wchar_t *ustrlow(wchar_t *s) {
 295     wchar_t *p = s;
 296     while (*p) {
 297         *p = utolower(*p);
 298         p++;
 299     }
 300     return s;
 301 }
 302
 303 int utoi(wchar_t *s) {
 304     int sign = +1;
 305     int n;
 306
 307     if (*s == L'-') {
 308         s++;
 309         sign = -1;
 310     }
 311
 312     n = 0;
 313     while (*s && *s >= L'0' && *s <= L'9') {
 314         n *= 10;
 315         n += (*s - '0');
 316         s++;
 317     }
 318
 319     return n;
 320 }
 321
 322 int utob(wchar_t *s) {
 323     if (!ustricmp(s, L"yes") || !ustricmp(s, L"y") ||
 324         !ustricmp(s, L"true") || !ustricmp(s, L"t"))
 325         return TRUE;
 326     return FALSE;
 327 }
 328
 329 int uisdigit(wchar_t c) {
 330     return c >= L'0' && c <= L'9';
 331 }
 332
 333 #define USTRFTIME_DELTA 128
 334 wchar_t *ustrftime(wchar_t *wfmt, struct tm *timespec) {
 335     void *blk = NULL;
 336     wchar_t *wblk, *wp;
 337     char *fmt, *text, *p;
 338     size_t size = 0;
 339     size_t len;
 340
 341     /*
 342      * FIXME: really we ought to copy non-% parts of the format
 343      * ourselves, and only resort to strftime for % parts. Also we
 344      * should use wcsftime if it's present.
 345      */
 346
 347     /*
 348      * strftime has the entertaining property that it returns 0
 349      * _either_ on out-of-space _or_ on successful generation of
 350      * the empty string. Hence we must ensure our format can never
 351      * generate the empty string. Somebody throw a custard pie at
 352      * whoever was responsible for that. Please?
 353      */
 354     if (wfmt) {
 355         len = ustrlen(wfmt);
 356         fmt = mknewa(char, 2+len);
 357         ustrtoa(wfmt, fmt+1, len+1, CS_ASCII);   /* CS_FIXME? */
 358         fmt[0] = ' ';
 359     } else
 360         fmt = " %c";
 361
 362     while (1) {
 363         size += USTRFTIME_DELTA;
 364         blk = resize((char *)blk, size);
 365         len = strftime((char *)blk, size-1, fmt, timespec);
 366         if (len > 0)
 367             break;
 368     }
 369
 370     /* Note: +1 for the terminating 0, -1 for the initial space in fmt */
 371     wblk = resize((wchar_t *)blk, len);
 372     text = mknewa(char, len);
 373     strftime(text, len, fmt+1, timespec);
 374     /*
 375      * We operate in the C locale, so this all ought to be kosher
 376      * ASCII. If we ever move outside ASCII machines, we may need
 377      * to make this more portable...
 378      */
 379     for (wp = wblk, p = text; *p; p++, wp++)
 380         *wp = *p;
 381     *wp = 0;
 382     if (wfmt)
 383         sfree(fmt);
 384     sfree(text);
 385     return wblk;
 386 }
 387
 388 /*
 389  * Determine whether a Unicode string can be translated into a
 390  * given charset without any missing characters.
 391  */
 392 int cvt_ok(int charset, const wchar_t *s)
 393 {
 394     char buf[256];
 395     charset_state state = CHARSET_INIT_STATE;
 396     int err, len = ustrlen(s);
 397
 398     err = 0;
 399     while (len > 0) {
 400         (void)charset_from_unicode(&s, &len, buf, lenof(buf),
 401                                    charset, &state, &err);
 402         if (err)
 403             return FALSE;
 404     }
 405     return TRUE;
 406 }