| 1 | /* |
| 2 | * ustring.c: Unicode string routines |
| 3 | */ |
| 4 | |
| 5 | #include <wchar.h> |
| 6 | #include <stdlib.h> |
| 7 | #include <assert.h> |
| 8 | #include <time.h> |
| 9 | #include "halibut.h" |
| 10 | |
| 11 | wchar_t *ustrdup(wchar_t const *s) { |
| 12 | wchar_t *r; |
| 13 | if (s) { |
| 14 | r = snewn(1+ustrlen(s), wchar_t); |
| 15 | ustrcpy(r, s); |
| 16 | } else { |
| 17 | r = snew(wchar_t); |
| 18 | *r = 0; |
| 19 | } |
| 20 | return r; |
| 21 | } |
| 22 | |
| 23 | static char *ustrtoa_internal(wchar_t const *s, char *outbuf, int size, |
| 24 | int charset, int careful) { |
| 25 | int len, ret, err; |
| 26 | charset_state state = CHARSET_INIT_STATE; |
| 27 | |
| 28 | if (!s) { |
| 29 | *outbuf = '\0'; |
| 30 | return outbuf; |
| 31 | } |
| 32 | |
| 33 | len = ustrlen(s); |
| 34 | size--; /* leave room for terminating NUL */ |
| 35 | *outbuf = '\0'; |
| 36 | while (len > 0) { |
| 37 | err = 0; |
| 38 | ret = charset_from_unicode(&s, &len, outbuf, size, charset, &state, |
| 39 | (careful ? &err : NULL)); |
| 40 | if (err) |
| 41 | return NULL; |
| 42 | if (!ret) |
| 43 | return outbuf; |
| 44 | size -= ret; |
| 45 | outbuf += ret; |
| 46 | *outbuf = '\0'; |
| 47 | } |
| 48 | /* |
| 49 | * Clean up |
| 50 | */ |
| 51 | ret = charset_from_unicode(NULL, 0, outbuf, size, charset, &state, NULL); |
| 52 | size -= ret; |
| 53 | outbuf += ret; |
| 54 | *outbuf = '\0'; |
| 55 | return outbuf; |
| 56 | } |
| 57 | |
| 58 | char *ustrtoa(wchar_t const *s, char *outbuf, int size, int charset) { |
| 59 | return ustrtoa_internal(s, outbuf, size, charset, FALSE); |
| 60 | } |
| 61 | |
| 62 | char *ustrtoa_careful(wchar_t const *s, char *outbuf, int size, int charset) { |
| 63 | return ustrtoa_internal(s, outbuf, size, charset, TRUE); |
| 64 | } |
| 65 | |
| 66 | wchar_t *ustrfroma(char const *s, wchar_t *outbuf, int size, int charset) { |
| 67 | int len, ret; |
| 68 | charset_state state = CHARSET_INIT_STATE; |
| 69 | |
| 70 | if (!s) { |
| 71 | *outbuf = L'\0'; |
| 72 | return outbuf; |
| 73 | } |
| 74 | |
| 75 | len = strlen(s); |
| 76 | size--; /* allow for terminating NUL */ |
| 77 | *outbuf = L'\0'; |
| 78 | while (len > 0) { |
| 79 | ret = charset_to_unicode(&s, &len, outbuf, size, |
| 80 | charset, &state, NULL, 0); |
| 81 | if (!ret) |
| 82 | return outbuf; |
| 83 | outbuf += ret; |
| 84 | size -= ret; |
| 85 | *outbuf = L'\0'; |
| 86 | } |
| 87 | return outbuf; |
| 88 | } |
| 89 | |
| 90 | char *utoa_internal_dup(wchar_t const *s, int charset, int *lenp, int careful) |
| 91 | { |
| 92 | char *outbuf; |
| 93 | int outpos, outlen, len, ret, err; |
| 94 | charset_state state = CHARSET_INIT_STATE; |
| 95 | |
| 96 | if (!s) { |
| 97 | return dupstr(""); |
| 98 | } |
| 99 | |
| 100 | len = ustrlen(s); |
| 101 | |
| 102 | outlen = len + 10; |
| 103 | outbuf = snewn(outlen, char); |
| 104 | |
| 105 | outpos = 0; |
| 106 | outbuf[outpos] = '\0'; |
| 107 | |
| 108 | while (len > 0) { |
| 109 | err = 0; |
| 110 | ret = charset_from_unicode(&s, &len, |
| 111 | outbuf + outpos, outlen - outpos - 1, |
| 112 | charset, &state, (careful ? &err : NULL)); |
| 113 | if (err) { |
| 114 | sfree(outbuf); |
| 115 | return NULL; |
| 116 | } |
| 117 | if (!ret) { |
| 118 | outlen = outlen * 3 / 2; |
| 119 | outbuf = sresize(outbuf, outlen, char); |
| 120 | } |
| 121 | outpos += ret; |
| 122 | outbuf[outpos] = '\0'; |
| 123 | } |
| 124 | /* |
| 125 | * Clean up |
| 126 | */ |
| 127 | outlen = outpos + 32; |
| 128 | outbuf = sresize(outbuf, outlen, char); |
| 129 | ret = charset_from_unicode(NULL, 0, |
| 130 | outbuf + outpos, outlen - outpos + 1, |
| 131 | charset, &state, NULL); |
| 132 | outpos += ret; |
| 133 | outbuf[outpos] = '\0'; |
| 134 | if (lenp) |
| 135 | *lenp = outpos; |
| 136 | return outbuf; |
| 137 | } |
| 138 | |
| 139 | char *utoa_dup(wchar_t const *s, int charset) |
| 140 | { |
| 141 | return utoa_internal_dup(s, charset, NULL, FALSE); |
| 142 | } |
| 143 | |
| 144 | char *utoa_dup_len(wchar_t const *s, int charset, int *len) |
| 145 | { |
| 146 | return utoa_internal_dup(s, charset, len, FALSE); |
| 147 | } |
| 148 | |
| 149 | char *utoa_careful_dup(wchar_t const *s, int charset) |
| 150 | { |
| 151 | return utoa_internal_dup(s, charset, NULL, TRUE); |
| 152 | } |
| 153 | |
| 154 | wchar_t *ufroma_dup(char const *s, int charset) { |
| 155 | int len; |
| 156 | wchar_t *buf = NULL; |
| 157 | |
| 158 | len = strlen(s) + 1; |
| 159 | do { |
| 160 | buf = sresize(buf, len, wchar_t); |
| 161 | ustrfroma(s, buf, len, charset); |
| 162 | len = (3 * len) / 2 + 1; /* this guarantees a strict increase */ |
| 163 | } while (ustrlen(buf) >= len-1); |
| 164 | |
| 165 | buf = sresize(buf, ustrlen(buf)+1, wchar_t); |
| 166 | return buf; |
| 167 | } |
| 168 | |
| 169 | char *utoa_locale_dup(wchar_t const *s) |
| 170 | { |
| 171 | /* |
| 172 | * This variant uses the C library locale. |
| 173 | */ |
| 174 | char *ret; |
| 175 | int len; |
| 176 | size_t siz; |
| 177 | |
| 178 | len = ustrlen(s); |
| 179 | |
| 180 | ret = snewn(1 + MB_CUR_MAX * len, char); |
| 181 | |
| 182 | siz = wcstombs(ret, s, len); |
| 183 | |
| 184 | if (siz) { |
| 185 | assert(siz <= MB_CUR_MAX * len); |
| 186 | ret[siz] = '\0'; |
| 187 | ret = sresize(ret, siz+1, char); |
| 188 | return ret; |
| 189 | } |
| 190 | |
| 191 | /* |
| 192 | * If that failed, try a different strategy (which we will also |
| 193 | * attempt in the total absence of wcstombs). Retrieve the |
| 194 | * locale's charset from nl_langinfo or equivalent, and use |
| 195 | * normal utoa_dup. |
| 196 | */ |
| 197 | return utoa_dup(s, charset_from_locale()); |
| 198 | } |
| 199 | |
| 200 | wchar_t *ufroma_locale_dup(char const *s) |
| 201 | { |
| 202 | /* |
| 203 | * This variant uses the C library locale. |
| 204 | */ |
| 205 | wchar_t *ret; |
| 206 | int len; |
| 207 | size_t siz; |
| 208 | |
| 209 | len = strlen(s); |
| 210 | |
| 211 | ret = snewn(1 + 2*len, wchar_t); /* be conservative */ |
| 212 | |
| 213 | siz = mbstowcs(ret, s, len); |
| 214 | |
| 215 | if (siz) { |
| 216 | assert(siz <= (size_t)(2 * len)); |
| 217 | ret[siz] = L'\0'; |
| 218 | ret = sresize(ret, siz+1, wchar_t); |
| 219 | return ret; |
| 220 | } |
| 221 | |
| 222 | /* |
| 223 | * If that failed, try a different strategy (which we will also |
| 224 | * attempt in the total absence of wcstombs). Retrieve the |
| 225 | * locale's charset from nl_langinfo or equivalent, and use |
| 226 | * normal ufroma_dup. |
| 227 | */ |
| 228 | return ufroma_dup(s, charset_from_locale()); |
| 229 | } |
| 230 | |
/*
 * Return the number of wide characters in `s' before its
 * terminating NUL. `s' must not be NULL.
 */
int ustrlen(wchar_t const *s) {
    wchar_t const *end = s;

    while (*end != L'\0')
        end++;
    return (int)(end - s);
}
| 236 | |
/*
 * Return a pointer to the element just past the NUL terminator of
 * the wide string starting at `s'.
 */
wchar_t *uadv(wchar_t *s) {
    int skip = 1 + ustrlen(s);         /* the string plus its NUL */
    return s + skip;
}
| 240 | |
/*
 * Copy the wide string `source', including its terminating NUL,
 * into `dest'. Returns `dest'.
 */
wchar_t *ustrcpy(wchar_t *dest, wchar_t const *source) {
    wchar_t *out = dest;

    for (;;) {
        wchar_t ch = *source++;
        *out++ = ch;
        if (ch == L'\0')
            break;                     /* the NUL has been copied too */
    }
    return dest;
}
| 248 | |
/*
 * Bounded wide-string copy from `source' to `dest'. Returns `dest'.
 *
 * Once the NUL in `source' has been reached, it is copied again on
 * each remaining step, so the tail of the output is NUL-padded.
 *
 * NOTE(review): the loop stores n+1 elements, not n (and one
 * element even when n <= 0). If `source' is longer than n, the
 * last element stored is source[n] rather than a NUL. Callers must
 * size `dest' accordingly -- confirm this is what they expect.
 */
wchar_t *ustrncpy(wchar_t *dest, wchar_t const *source, int n) {
    wchar_t *out = dest;

    for (;;) {
        *out++ = *source;
        if (*source != L'\0')
            source++;                  /* stay on the NUL once reached */
        if (n-- <= 0)
            break;
    }
    return dest;
}
| 257 | |
/*
 * Compare two wide strings element by element. Returns -1, 0 or +1.
 * NULL pointers are accepted: two NULLs compare equal, and a NULL
 * sorts before any non-NULL string.
 */
int ustrcmp(wchar_t *lhs, wchar_t *rhs) {
    if (!lhs || !rhs) {
        if (lhs)
            return +1;                 /* only rhs is NULL */
        if (rhs)
            return -1;                 /* only lhs is NULL */
        return 0;                      /* both are NULL */
    }

    /* Skip the common prefix. */
    for (; *lhs && *rhs && *lhs == *rhs; lhs++, rhs++)
        ;

    if (*lhs == *rhs)
        return 0;
    return (*lhs < *rhs) ? -1 : 1;
}
| 270 | |
/*
 * Lower-case a single wide character. With HAS_TOWLOWER defined the
 * work is done by towlower(); otherwise only 'A'..'Z' are mapped.
 * NUL is always returned unchanged, a property ustricmp needs.
 */
wchar_t utolower(wchar_t c) {
    wchar_t lowered = c;

    if (c != L'\0') {
#ifdef HAS_TOWLOWER
        lowered = towlower(c);
#else
        if ('A' <= c && c <= 'Z')
            lowered = c + ('a'-'A');
#endif
    }
    return lowered;
}
| 282 | |
/*
 * Test whether `c' is alphabetic. With HAS_ISWALPHA defined this is
 * whatever iswalpha() says; otherwise it is 1 for 'A'..'Z' and
 * 'a'..'z' and 0 for everything else.
 */
int uisalpha(wchar_t c) {
#ifdef HAS_ISWALPHA
    return iswalpha(c);
#else
    if (c >= 'A' && c <= 'Z')
        return 1;
    if (c >= 'a' && c <= 'z')
        return 1;
    return 0;
#endif
}
| 290 | |
/*
 * Case-insensitive comparison of two wide strings, folding each
 * element with utolower(). Returns 0 if they match, -1 if `lhs'
 * sorts first and 1 otherwise.
 */
int ustricmp(wchar_t const *lhs, wchar_t const *rhs) {
    for (;;) {
        wchar_t l = utolower(*lhs++);
        wchar_t r = utolower(*rhs++);

        if (l != r)
            return (l < r) ? -1 : 1;
        if (l == L'\0')
            return 0;                  /* both strings ended together */
    }
}
| 302 | |
/*
 * Case-insensitive comparison of at most `maxlen' leading elements
 * of two wide strings, folding each element with utolower().
 * Returns -1, 0 or 1. A `maxlen' of zero or less compares equal.
 */
int ustrnicmp(wchar_t const *lhs, wchar_t const *rhs, int maxlen) {
    wchar_t l = 0, r = 0;
    int i;

    for (i = 0; i < maxlen; i++) {
        l = utolower(lhs[i]);
        r = utolower(rhs[i]);
        /* Stop at the first difference, or when both strings end. */
        if (l != r || l == L'\0')
            break;
    }

    if (l == r)
        return 0;
    return (l < r) ? -1 : 1;
}
| 315 | |
/*
 * Lower-case the wide string `s' in place, one element at a time,
 * using utolower(). Returns `s'.
 */
wchar_t *ustrlow(wchar_t *s) {
    wchar_t *p;

    for (p = s; *p; p++)
        *p = utolower(*p);
    return s;
}
| 324 | |
/*
 * Parse a decimal integer from the start of the wide string `s'.
 *
 * An optional leading '-' is accepted, followed by ASCII digits
 * '0'..'9'. Parsing stops at the first non-digit, and 0 is returned
 * if there are no digits at all. Leading whitespace and a leading
 * '+' are not recognised. No overflow checking is performed.
 */
int utoi(wchar_t const *s) {
    int negative = 0;
    int n = 0;

    if (*s == L'-') {
        negative = 1;
        s++;
    }

    while (*s >= L'0' && *s <= L'9') {
        n = n * 10 + (int)(*s - L'0');
        s++;
    }

    /* Apply the sign parsed above; without this "-42" yields 42. */
    return negative ? -n : n;
}
| 343 | |
| 344 | double utof(wchar_t const *s) |
| 345 | { |
| 346 | char *cs = utoa_dup(s, CS_ASCII); |
| 347 | double ret = atof(cs); |
| 348 | sfree(cs); |
| 349 | return ret; |
| 350 | } |
| 351 | |
| 352 | int utob(wchar_t const *s) { |
| 353 | if (!ustricmp(s, L"yes") || !ustricmp(s, L"y") || |
| 354 | !ustricmp(s, L"true") || !ustricmp(s, L"t")) |
| 355 | return TRUE; |
| 356 | return FALSE; |
| 357 | } |
| 358 | |
/*
 * Return 1 if `c' is one of the ASCII digits '0'..'9', else 0.
 */
int uisdigit(wchar_t c) {
    if (c < L'0')
        return 0;
    if (c > L'9')
        return 0;
    return 1;
}
| 362 | |
| 363 | #define USTRFTIME_DELTA 128 |
| 364 | static void ustrftime_internal(rdstring *rs, char formatchr, |
| 365 | const struct tm *timespec) |
| 366 | { |
| 367 | /* |
| 368 | * strftime has the entertaining property that it returns 0 |
| 369 | * _either_ on out-of-space _or_ on successful generation of |
| 370 | * the empty string. Hence we must ensure our format can never |
| 371 | * generate the empty string. Somebody throw a custard pie at |
| 372 | * whoever was responsible for that. Please? |
| 373 | */ |
| 374 | |
| 375 | #ifdef HAS_WCSFTIME |
| 376 | wchar_t *buf = NULL; |
| 377 | wchar_t fmt[4]; |
| 378 | int size, ret; |
| 379 | |
| 380 | fmt[0] = L' '; |
| 381 | fmt[1] = L'%'; |
| 382 | /* Format chars are all ASCII, so conversion to Unicode is no problem */ |
| 383 | fmt[2] = formatchr; |
| 384 | fmt[3] = L'\0'; |
| 385 | |
| 386 | size = 0; |
| 387 | do { |
| 388 | size += USTRFTIME_DELTA; |
| 389 | buf = sresize(buf, size, wchar_t); |
| 390 | ret = (int) wcsftime(buf, size, fmt, timespec); |
| 391 | } while (ret == 0); |
| 392 | |
| 393 | rdadds(rs, buf+1); |
| 394 | sfree(buf); |
| 395 | #else |
| 396 | char *buf = NULL; |
| 397 | wchar_t *cvtbuf; |
| 398 | char fmt[4]; |
| 399 | int size, ret; |
| 400 | |
| 401 | fmt[0] = ' '; |
| 402 | fmt[1] = '%'; |
| 403 | fmt[2] = formatchr; |
| 404 | fmt[3] = '\0'; |
| 405 | |
| 406 | size = 0; |
| 407 | do { |
| 408 | size += USTRFTIME_DELTA; |
| 409 | buf = sresize(buf, size, char); |
| 410 | ret = (int) strftime(buf, size, fmt, timespec); |
| 411 | } while (ret == 0); |
| 412 | |
| 413 | cvtbuf = ufroma_locale_dup(buf+1); |
| 414 | rdadds(rs, cvtbuf); |
| 415 | sfree(cvtbuf); |
| 416 | sfree(buf); |
| 417 | #endif |
| 418 | } |
| 419 | |
| 420 | wchar_t *ustrftime(const wchar_t *wfmt, const struct tm *timespec) |
| 421 | { |
| 422 | rdstring rs = { 0, 0, NULL }; |
| 423 | |
| 424 | if (!wfmt) |
| 425 | wfmt = L"%c"; |
| 426 | |
| 427 | while (*wfmt) { |
| 428 | if (wfmt[0] == L'%' && wfmt[1] == L'%') { |
| 429 | rdadd(&rs, L'%'); |
| 430 | wfmt += 2; |
| 431 | } else if (wfmt[0] == L'%' && wfmt[1]) { |
| 432 | ustrftime_internal(&rs, wfmt[1], timespec); |
| 433 | wfmt += 2; |
| 434 | } else { |
| 435 | rdadd(&rs, wfmt[0]); |
| 436 | wfmt++; |
| 437 | } |
| 438 | } |
| 439 | |
| 440 | return rdtrim(&rs); |
| 441 | } |
| 442 | |
| 443 | /* |
| 444 | * Determine whether a Unicode string can be translated into a |
| 445 | * given charset without any missing characters. |
| 446 | */ |
| 447 | int cvt_ok(int charset, const wchar_t *s) |
| 448 | { |
| 449 | char buf[256]; |
| 450 | charset_state state = CHARSET_INIT_STATE; |
| 451 | int err, len = ustrlen(s); |
| 452 | |
| 453 | err = 0; |
| 454 | while (len > 0) { |
| 455 | (void)charset_from_unicode(&s, &len, buf, lenof(buf), |
| 456 | charset, &state, &err); |
| 457 | if (err) |
| 458 | return FALSE; |
| 459 | } |
| 460 | return TRUE; |
| 461 | } |