X-Git-Url: https://git.distorted.org.uk/~mdw/sgt/halibut/blobdiff_plain/91f93b94744447a088ce435e50500a9598cb5466..8f664e7e91c918cd13248f6b684580c4dd2cdb31:/ustring.c diff --git a/ustring.c b/ustring.c index 11a022c..3c5698c 100644 --- a/ustring.c +++ b/ustring.c @@ -3,16 +3,18 @@ */ #include +#include +#include #include #include "halibut.h" wchar_t *ustrdup(wchar_t const *s) { wchar_t *r; if (s) { - r = mknewa(wchar_t, 1+ustrlen(s)); + r = snewn(1+ustrlen(s), wchar_t); ustrcpy(r, s); } else { - r = mknew(wchar_t); + r = snew(wchar_t); *r = 0; } return r; @@ -98,7 +100,7 @@ char *utoa_internal_dup(wchar_t const *s, int charset, int *lenp, int careful) len = ustrlen(s); outlen = len + 10; - outbuf = mknewa(char, outlen); + outbuf = snewn(outlen, char); outpos = 0; outbuf[outpos] = '\0'; @@ -114,7 +116,7 @@ char *utoa_internal_dup(wchar_t const *s, int charset, int *lenp, int careful) } if (!ret) { outlen = outlen * 3 / 2; - outbuf = resize(outbuf, outlen); + outbuf = sresize(outbuf, outlen, char); } outpos += ret; outbuf[outpos] = '\0'; @@ -123,7 +125,7 @@ char *utoa_internal_dup(wchar_t const *s, int charset, int *lenp, int careful) * Clean up */ outlen = outpos + 32; - outbuf = resize(outbuf, outlen); + outbuf = sresize(outbuf, outlen, char); ret = charset_from_unicode(NULL, 0, outbuf + outpos, outlen - outpos + 1, charset, &state, NULL); @@ -155,15 +157,79 @@ wchar_t *ufroma_dup(char const *s, int charset) { len = strlen(s) + 1; do { - buf = resize(buf, len); + buf = sresize(buf, len, wchar_t); ustrfroma(s, buf, len, charset); len = (3 * len) / 2 + 1; /* this guarantees a strict increase */ } while (ustrlen(buf) >= len-1); - buf = resize(buf, ustrlen(buf)+1); + buf = sresize(buf, ustrlen(buf)+1, wchar_t); return buf; } +char *utoa_locale_dup(wchar_t const *s) +{ + /* + * This variant uses the C library locale. + */ + char *ret; + int len, outlen; + size_t siz; + + len = ustrlen(s); + + outlen = 1 + MB_CUR_MAX * len; + ret = snewn(outlen+1, char); + + siz = wcstombs(ret, s, outlen); + + if (siz) { + assert(siz <= (size_t)(outlen)); + ret[siz] = '\0'; + ret = sresize(ret, siz+1, char); + return ret; + } + + /* + * If that failed, try a different strategy (which we will also + * attempt in the total absence of wcstombs). Retrieve the + * locale's charset from nl_langinfo or equivalent, and use + * normal utoa_dup. + */ + return utoa_dup(s, charset_from_locale()); +} + +wchar_t *ufroma_locale_dup(char const *s) +{ + /* + * This variant uses the C library locale. + */ + wchar_t *ret; + int len, outlen; + size_t siz; + + len = strlen(s); + + outlen = 1 + 2*len; + ret = snewn(outlen+1, wchar_t); /* be conservative */ + + siz = mbstowcs(ret, s, outlen); + + if (siz) { + assert(siz <= (size_t)(outlen)); + ret[siz] = L'\0'; + ret = sresize(ret, siz+1, wchar_t); + return ret; + } + + /* + * If that failed, try a different strategy (which we will also + * attempt in the total absence of wcstombs). Retrieve the + * locale's charset from nl_langinfo or equivalent, and use + * normal ufroma_dup. + */ + return ufroma_dup(s, charset_from_locale()); +} + int ustrlen(wchar_t const *s) { int len = 0; while (*s++) len++; @@ -182,6 +248,15 @@ wchar_t *ustrcpy(wchar_t *dest, wchar_t const *source) { return ret; } +wchar_t *ustrncpy(wchar_t *dest, wchar_t const *source, int n) { + wchar_t *ret = dest; + do { + *dest++ = *source; + if (*source) source++; + } while (n-- > 0); + return ret; +} + int ustrcmp(wchar_t *lhs, wchar_t *rhs) { if (!lhs && !rhs) return 0; if (!lhs) return -1; @@ -198,18 +273,24 @@ int ustrcmp(wchar_t *lhs, wchar_t *rhs) { wchar_t utolower(wchar_t c) { if (c == L'\0') return c; /* this property needed by ustricmp */ - /* FIXME: this doesn't even come close */ +#ifdef HAS_TOWLOWER + return towlower(c); +#else if (c >= 'A' && c <= 'Z') c += 'a'-'A'; return c; +#endif } int uisalpha(wchar_t c) { - /* FIXME: this doesn't even come close */ +#ifdef HAS_ISWALPHA + return iswalpha(c); +#else return (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z'); +#endif } -int ustricmp(wchar_t *lhs, wchar_t *rhs) { +int ustricmp(wchar_t const *lhs, wchar_t const *rhs) { wchar_t lc, rc; while ((lc = utolower(*lhs)) == (rc = utolower(*rhs)) && lc && rc) lhs++, rhs++; @@ -221,6 +302,19 @@ int ustricmp(wchar_t *lhs, wchar_t *rhs) { return 1; } +int ustrnicmp(wchar_t const *lhs, wchar_t const *rhs, int maxlen) { + wchar_t lc = 0, rc = 0; + while (maxlen-- > 0 && + (lc = utolower(*lhs)) == (rc = utolower(*rhs)) && lc && rc) + lhs++, rhs++; + if (lc < rc) + return -1; + else if (lc > rc) + return 1; + else + return 0; +} + wchar_t *ustrlow(wchar_t *s) { wchar_t *p = s; while (*p) { @@ -230,7 +324,7 @@ wchar_t *ustrlow(wchar_t *s) { return s; } -int utoi(wchar_t *s) { +int utoi(wchar_t const *s) { int sign = +1; int n; @@ -246,10 +340,18 @@ int utoi(wchar_t *s) { s++; } - return n; + return n * sign; +} + +double utof(wchar_t const *s) +{ + char *cs = utoa_dup(s, CS_ASCII); + double ret = atof(cs); + sfree(cs); + return ret; } -int utob(wchar_t *s) { +int utob(wchar_t const *s) { if (!ustricmp(s, L"yes") || !ustricmp(s, L"y") || !ustricmp(s, L"true") || !ustricmp(s, L"t")) return TRUE; @@ -261,19 +363,9 @@ int uisdigit(wchar_t c) { } #define USTRFTIME_DELTA 128 -wchar_t *ustrftime(wchar_t *wfmt, struct tm *timespec) { - void *blk = NULL; - wchar_t *wblk, *wp; - char *fmt, *text, *p; - size_t size = 0; - size_t len; - - /* - * FIXME: really we ought to copy non-% parts of the format - * ourselves, and only resort to strftime for % parts. Also we - * should use wcsftime if it's present. - */ - +static void ustrftime_internal(rdstring *rs, char formatchr, + const struct tm *timespec) +{ /* * strftime has the entertaining property that it returns 0 * _either_ on out-of-space _or_ on successful generation of @@ -281,38 +373,73 @@ wchar_t *ustrftime(wchar_t *wfmt, struct tm *timespec) { * generate the empty string. Somebody throw a custard pie at * whoever was responsible for that. Please? */ - if (wfmt) { - len = ustrlen(wfmt); - fmt = mknewa(char, 2+len); - ustrtoa(wfmt, fmt+1, len+1, CS_ASCII); /* CS_FIXME? */ - fmt[0] = ' '; - } else - fmt = " %c"; - - while (1) { + +#ifdef HAS_WCSFTIME + wchar_t *buf = NULL; + wchar_t fmt[4]; + int size, ret; + + fmt[0] = L' '; + fmt[1] = L'%'; + /* Format chars are all ASCII, so conversion to Unicode is no problem */ + fmt[2] = formatchr; + fmt[3] = L'\0'; + + size = 0; + do { + size += USTRFTIME_DELTA; + buf = sresize(buf, size, wchar_t); + ret = (int) wcsftime(buf, size, fmt, timespec); + } while (ret == 0); + + rdadds(rs, buf+1); + sfree(buf); +#else + char *buf = NULL; + wchar_t *cvtbuf; + char fmt[4]; + int size, ret; + + fmt[0] = ' '; + fmt[1] = '%'; + fmt[2] = formatchr; + fmt[3] = '\0'; + + size = 0; + do { size += USTRFTIME_DELTA; - blk = resize((char *)blk, size); - len = strftime((char *)blk, size-1, fmt, timespec); - if (len > 0) - break; + buf = sresize(buf, size, char); + ret = (int) strftime(buf, size, fmt, timespec); + } while (ret == 0); + + cvtbuf = ufroma_locale_dup(buf+1); + rdadds(rs, cvtbuf); + sfree(cvtbuf); + sfree(buf); +#endif +} + +wchar_t *ustrftime(const wchar_t *wfmt, const struct tm *timespec) +{ + rdstring rs = { 0, 0, NULL }; + + if (!wfmt) + wfmt = L"%c"; + + while (*wfmt) { + if (wfmt[0] == L'%' && wfmt[1] == L'%') { + rdadd(&rs, L'%'); + wfmt += 2; + } else if (wfmt[0] == L'%' && wfmt[1]) { + ustrftime_internal(&rs, wfmt[1], timespec); + wfmt += 2; + } else { + rdadd(&rs, wfmt[0]); + wfmt++; + } } - /* Note: +1 for the terminating 0, -1 for the initial space in fmt */ - wblk = resize((wchar_t *)blk, len); - text = mknewa(char, len); - strftime(text, len, fmt+1, timespec); - /* - * We operate in the C locale, so this all ought to be kosher - * ASCII. If we ever move outside ASCII machines, we may need - * to make this more portable... - */ - for (wp = wblk, p = text; *p; p++, wp++) - *wp = *p; - *wp = 0; - if (wfmt) - sfree(fmt); - sfree(text); - return wblk; + return rdtrim(&rs); } /* @@ -334,3 +461,28 @@ int cvt_ok(int charset, const wchar_t *s) } return TRUE; } + +/* + * Wrapper around charset_from_localenc which accepts the charset + * name as a wide string (since that happens to be more useful). + * Also throws a Halibut error and falls back to CS_ASCII if the + * charset is unrecognised, meaning the rest of the program can + * rely on always getting a valid charset id back from this + * function. + */ +int charset_from_ustr(filepos *fpos, const wchar_t *name) +{ + char *csname; + int charset; + + csname = utoa_dup(name, CS_ASCII); + charset = charset_from_localenc(csname); + + if (charset == CS_NONE) { + charset = CS_ASCII; + error(err_charset, fpos, name); + } + + sfree(csname); + return charset; +}