X-Git-Url: https://git.distorted.org.uk/~mdw/sgt/halibut/blobdiff_plain/d7482997dd1ca71b70df43c15dd5956f435a1a7e..8f664e7e91c918cd13248f6b684580c4dd2cdb31:/ustring.c diff --git a/ustring.c b/ustring.c index 1573a19..3c5698c 100644 --- a/ustring.c +++ b/ustring.c @@ -3,37 +3,234 @@ */ #include +#include +#include #include #include "halibut.h" -wchar_t *ustrdup(wchar_t *s) { +wchar_t *ustrdup(wchar_t const *s) { wchar_t *r; if (s) { - r = mknewa(wchar_t, 1+ustrlen(s)); + r = snewn(1+ustrlen(s), wchar_t); ustrcpy(r, s); } else { - r = mknew(wchar_t); + r = snew(wchar_t); *r = 0; } return r; } -char *ustrtoa(wchar_t *s, char *outbuf, int size) { - char *p; +static char *ustrtoa_internal(wchar_t const *s, char *outbuf, int size, + int charset, int careful) { + int len, ret, err; + charset_state state = CHARSET_INIT_STATE; + if (!s) { *outbuf = '\0'; return outbuf; } - for (p = outbuf; *s && p < outbuf+size; p++,s++) - *p = *s; - if (p < outbuf+size) - *p = '\0'; - else - outbuf[size-1] = '\0'; + + len = ustrlen(s); + size--; /* leave room for terminating NUL */ + *outbuf = '\0'; + while (len > 0) { + err = 0; + ret = charset_from_unicode(&s, &len, outbuf, size, charset, &state, + (careful ? &err : NULL)); + if (err) + return NULL; + if (!ret) + return outbuf; + size -= ret; + outbuf += ret; + *outbuf = '\0'; + } + /* + * Clean up + */ + ret = charset_from_unicode(NULL, 0, outbuf, size, charset, &state, NULL); + size -= ret; + outbuf += ret; + *outbuf = '\0'; + return outbuf; +} + +char *ustrtoa(wchar_t const *s, char *outbuf, int size, int charset) { + return ustrtoa_internal(s, outbuf, size, charset, FALSE); +} + +char *ustrtoa_careful(wchar_t const *s, char *outbuf, int size, int charset) { + return ustrtoa_internal(s, outbuf, size, charset, TRUE); +} + +wchar_t *ustrfroma(char const *s, wchar_t *outbuf, int size, int charset) { + int len, ret; + charset_state state = CHARSET_INIT_STATE; + + if (!s) { + *outbuf = L'\0'; + return outbuf; + } + + len = strlen(s); + size--; /* allow for terminating NUL */ + *outbuf = L'\0'; + while (len > 0) { + ret = charset_to_unicode(&s, &len, outbuf, size, + charset, &state, NULL, 0); + if (!ret) + return outbuf; + outbuf += ret; + size -= ret; + *outbuf = L'\0'; + } return outbuf; } -int ustrlen(wchar_t *s) { +char *utoa_internal_dup(wchar_t const *s, int charset, int *lenp, int careful) +{ + char *outbuf; + int outpos, outlen, len, ret, err; + charset_state state = CHARSET_INIT_STATE; + + if (!s) { + return dupstr(""); + } + + len = ustrlen(s); + + outlen = len + 10; + outbuf = snewn(outlen, char); + + outpos = 0; + outbuf[outpos] = '\0'; + + while (len > 0) { + err = 0; + ret = charset_from_unicode(&s, &len, + outbuf + outpos, outlen - outpos - 1, + charset, &state, (careful ? &err : NULL)); + if (err) { + sfree(outbuf); + return NULL; + } + if (!ret) { + outlen = outlen * 3 / 2; + outbuf = sresize(outbuf, outlen, char); + } + outpos += ret; + outbuf[outpos] = '\0'; + } + /* + * Clean up + */ + outlen = outpos + 32; + outbuf = sresize(outbuf, outlen, char); + ret = charset_from_unicode(NULL, 0, + outbuf + outpos, outlen - outpos + 1, + charset, &state, NULL); + outpos += ret; + outbuf[outpos] = '\0'; + if (lenp) + *lenp = outpos; + return outbuf; +} + +char *utoa_dup(wchar_t const *s, int charset) +{ + return utoa_internal_dup(s, charset, NULL, FALSE); +} + +char *utoa_dup_len(wchar_t const *s, int charset, int *len) +{ + return utoa_internal_dup(s, charset, len, FALSE); +} + +char *utoa_careful_dup(wchar_t const *s, int charset) +{ + return utoa_internal_dup(s, charset, NULL, TRUE); +} + +wchar_t *ufroma_dup(char const *s, int charset) { + int len; + wchar_t *buf = NULL; + + len = strlen(s) + 1; + do { + buf = sresize(buf, len, wchar_t); + ustrfroma(s, buf, len, charset); + len = (3 * len) / 2 + 1; /* this guarantees a strict increase */ + } while (ustrlen(buf) >= len-1); + + buf = sresize(buf, ustrlen(buf)+1, wchar_t); + return buf; +} + +char *utoa_locale_dup(wchar_t const *s) +{ + /* + * This variant uses the C library locale. + */ + char *ret; + int len, outlen; + size_t siz; + + len = ustrlen(s); + + outlen = 1 + MB_CUR_MAX * len; + ret = snewn(outlen+1, char); + + siz = wcstombs(ret, s, outlen); + + if (siz) { + assert(siz <= (size_t)(outlen)); + ret[siz] = '\0'; + ret = sresize(ret, siz+1, char); + return ret; + } + + /* + * If that failed, try a different strategy (which we will also + * attempt in the total absence of wcstombs). Retrieve the + * locale's charset from nl_langinfo or equivalent, and use + * normal utoa_dup. + */ + return utoa_dup(s, charset_from_locale()); +} + +wchar_t *ufroma_locale_dup(char const *s) +{ + /* + * This variant uses the C library locale. + */ + wchar_t *ret; + int len, outlen; + size_t siz; + + len = strlen(s); + + outlen = 1 + 2*len; + ret = snewn(outlen+1, wchar_t); /* be conservative */ + + siz = mbstowcs(ret, s, outlen); + + if (siz) { + assert(siz <= (size_t)(outlen)); + ret[siz] = L'\0'; + ret = sresize(ret, siz+1, wchar_t); + return ret; + } + + /* + * If that failed, try a different strategy (which we will also + * attempt in the total absence of wcstombs). Retrieve the + * locale's charset from nl_langinfo or equivalent, and use + * normal ufroma_dup. + */ + return ufroma_dup(s, charset_from_locale()); +} + +int ustrlen(wchar_t const *s) { int len = 0; while (*s++) len++; return len; @@ -43,7 +240,7 @@ wchar_t *uadv(wchar_t *s) { return s + 1 + ustrlen(s); } -wchar_t *ustrcpy(wchar_t *dest, wchar_t *source) { +wchar_t *ustrcpy(wchar_t *dest, wchar_t const *source) { wchar_t *ret = dest; do { *dest++ = *source; @@ -51,6 +248,15 @@ wchar_t *ustrcpy(wchar_t *dest, wchar_t *source) { return ret; } +wchar_t *ustrncpy(wchar_t *dest, wchar_t const *source, int n) { + wchar_t *ret = dest; + do { + *dest++ = *source; + if (*source) source++; + } while (n-- > 0); + return ret; +} + int ustrcmp(wchar_t *lhs, wchar_t *rhs) { if (!lhs && !rhs) return 0; if (!lhs) return -1; @@ -67,13 +273,24 @@ int ustrcmp(wchar_t *lhs, wchar_t *rhs) { wchar_t utolower(wchar_t c) { if (c == L'\0') return c; /* this property needed by ustricmp */ - /* FIXME: this doesn't even come close */ +#ifdef HAS_TOWLOWER + return towlower(c); +#else if (c >= 'A' && c <= 'Z') c += 'a'-'A'; return c; +#endif } -int ustricmp(wchar_t *lhs, wchar_t *rhs) { +int uisalpha(wchar_t c) { +#ifdef HAS_ISWALPHA + return iswalpha(c); +#else + return (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z'); +#endif +} + +int ustricmp(wchar_t const *lhs, wchar_t const *rhs) { wchar_t lc, rc; while ((lc = utolower(*lhs)) == (rc = utolower(*rhs)) && lc && rc) lhs++, rhs++; @@ -85,6 +302,19 @@ int ustricmp(wchar_t *lhs, wchar_t *rhs) { return 1; } +int ustrnicmp(wchar_t const *lhs, wchar_t const *rhs, int maxlen) { + wchar_t lc = 0, rc = 0; + while (maxlen-- > 0 && + (lc = utolower(*lhs)) == (rc = utolower(*rhs)) && lc && rc) + lhs++, rhs++; + if (lc < rc) + return -1; + else if (lc > rc) + return 1; + else + return 0; +} + wchar_t *ustrlow(wchar_t *s) { wchar_t *p = s; while (*p) { @@ -94,7 +324,7 @@ wchar_t *ustrlow(wchar_t *s) { return s; } -int utoi(wchar_t *s) { +int utoi(wchar_t const *s) { int sign = +1; int n; @@ -110,10 +340,18 @@ int utoi(wchar_t *s) { s++; } - return n; + return n * sign; } -int utob(wchar_t *s) { +double utof(wchar_t const *s) +{ + char *cs = utoa_dup(s, CS_ASCII); + double ret = atof(cs); + sfree(cs); + return ret; +} + +int utob(wchar_t const *s) { if (!ustricmp(s, L"yes") || !ustricmp(s, L"y") || !ustricmp(s, L"true") || !ustricmp(s, L"t")) return TRUE; @@ -125,13 +363,9 @@ int uisdigit(wchar_t c) { } #define USTRFTIME_DELTA 128 -wchar_t *ustrftime(wchar_t *wfmt, struct tm *timespec) { - void *blk = NULL; - wchar_t *wblk, *wp; - char *fmt, *text, *p; - size_t size = 0; - size_t len; - +static void ustrftime_internal(rdstring *rs, char formatchr, + const struct tm *timespec) +{ /* * strftime has the entertaining property that it returns 0 * _either_ on out-of-space _or_ on successful generation of @@ -139,36 +373,116 @@ wchar_t *ustrftime(wchar_t *wfmt, struct tm *timespec) { * generate the empty string. Somebody throw a custard pie at * whoever was responsible for that. Please? */ - if (wfmt) { - len = ustrlen(wfmt); - fmt = mknewa(char, 2+len); - ustrtoa(wfmt, fmt+1, len+1); - fmt[0] = ' '; - } else - fmt = " %c"; - - while (1) { + +#ifdef HAS_WCSFTIME + wchar_t *buf = NULL; + wchar_t fmt[4]; + int size, ret; + + fmt[0] = L' '; + fmt[1] = L'%'; + /* Format chars are all ASCII, so conversion to Unicode is no problem */ + fmt[2] = formatchr; + fmt[3] = L'\0'; + + size = 0; + do { + size += USTRFTIME_DELTA; + buf = sresize(buf, size, wchar_t); + ret = (int) wcsftime(buf, size, fmt, timespec); + } while (ret == 0); + + rdadds(rs, buf+1); + sfree(buf); +#else + char *buf = NULL; + wchar_t *cvtbuf; + char fmt[4]; + int size, ret; + + fmt[0] = ' '; + fmt[1] = '%'; + fmt[2] = formatchr; + fmt[3] = '\0'; + + size = 0; + do { size += USTRFTIME_DELTA; - blk = resize((char *)blk, size); - len = strftime((char *)blk, size-1, fmt, timespec); - if (len > 0) - break; + buf = sresize(buf, size, char); + ret = (int) strftime(buf, size, fmt, timespec); + } while (ret == 0); + + cvtbuf = ufroma_locale_dup(buf+1); + rdadds(rs, cvtbuf); + sfree(cvtbuf); + sfree(buf); +#endif +} + +wchar_t *ustrftime(const wchar_t *wfmt, const struct tm *timespec) +{ + rdstring rs = { 0, 0, NULL }; + + if (!wfmt) + wfmt = L"%c"; + + while (*wfmt) { + if (wfmt[0] == L'%' && wfmt[1] == L'%') { + rdadd(&rs, L'%'); + wfmt += 2; + } else if (wfmt[0] == L'%' && wfmt[1]) { + ustrftime_internal(&rs, wfmt[1], timespec); + wfmt += 2; + } else { + rdadd(&rs, wfmt[0]); + wfmt++; + } } - /* Note: +1 for the terminating 0, -1 for the initial space in fmt */ - wblk = resize((wchar_t *)blk, len); - text = mknewa(char, len); - strftime(text, len, fmt+1, timespec); - /* - * We operate in the C locale, so this all ought to be kosher - * ASCII. If we ever move outside ASCII machines, we may need - * to make this more portable... - */ - for (wp = wblk, p = text; *p; p++, wp++) - *wp = *p; - *wp = 0; - if (wfmt) - sfree(fmt); - sfree(text); - return wblk; + return rdtrim(&rs); +} + +/* + * Determine whether a Unicode string can be translated into a + * given charset without any missing characters. + */ +int cvt_ok(int charset, const wchar_t *s) +{ + char buf[256]; + charset_state state = CHARSET_INIT_STATE; + int err, len = ustrlen(s); + + err = 0; + while (len > 0) { + (void)charset_from_unicode(&s, &len, buf, lenof(buf), + charset, &state, &err); + if (err) + return FALSE; + } + return TRUE; +} + +/* + * Wrapper around charset_from_localenc which accepts the charset + * name as a wide string (since that happens to be more useful). + * Also throws a Halibut error and falls back to CS_ASCII if the + * charset is unrecognised, meaning the rest of the program can + * rely on always getting a valid charset id back from this + * function. + */ +int charset_from_ustr(filepos *fpos, const wchar_t *name) +{ + char *csname; + int charset; + + csname = utoa_dup(name, CS_ASCII); + charset = charset_from_localenc(csname); + + if (charset == CS_NONE) { + charset = CS_ASCII; + error(err_charset, fpos, name); + } + + sfree(csname); + return charset; }