X-Git-Url: https://git.distorted.org.uk/~mdw/sgt/halibut/blobdiff_plain/d7482997dd1ca71b70df43c15dd5956f435a1a7e..8f664e7e91c918cd13248f6b684580c4dd2cdb31:/ustring.c

diff --git a/ustring.c b/ustring.c
index 1573a19..3c5698c 100644
--- a/ustring.c
+++ b/ustring.c
@@ -3,37 +3,234 @@
  */
 
 #include <wchar.h>
+#include <stdlib.h>
+#include <assert.h>
 #include <time.h>
 #include "halibut.h"
 
-wchar_t *ustrdup(wchar_t *s) {
+wchar_t *ustrdup(wchar_t const *s) {
     wchar_t *r;
     if (s) {
-	r = mknewa(wchar_t, 1+ustrlen(s));
+	r = snewn(1+ustrlen(s), wchar_t);  /* +1: room for the terminating NUL */
 	ustrcpy(r, s);
     } else {
-	r = mknew(wchar_t);
+	r = snew(wchar_t);		   /* NULL input: one-element empty string */
 	*r = 0;
     }
     return r;
 }
 
-char *ustrtoa(wchar_t *s, char *outbuf, int size) {
-    char *p;
+static char *ustrtoa_internal(wchar_t const *s, char *outbuf, int size,
+			      int charset, int careful) {
+    int len, ret, err;
+    charset_state state = CHARSET_INIT_STATE;
+
     if (!s) {
 	*outbuf = '\0';
 	return outbuf;
     }
-    for (p = outbuf; *s && p < outbuf+size; p++,s++)
-	*p = *s;
-    if (p < outbuf+size)
-	*p = '\0';
-    else
-	outbuf[size-1] = '\0';
+
+    len = ustrlen(s);
+    size--;			       /* leave room for terminating NUL */
+    *outbuf = '\0';
+    while (len > 0) {
+	err = 0;		       /* &err is only passed when `careful' is set */
+	ret = charset_from_unicode(&s, &len, outbuf, size, charset, &state,
+				   (careful ? &err : NULL));
+	if (err)
+	    return NULL;	       /* careful mode: conversion error reported */
+	if (!ret)
+	    return outbuf;	       /* nothing produced: stop, already terminated */
+	size -= ret;
+	outbuf += ret;
+	*outbuf = '\0';
+    }
+    /*
+     * Final call with no input; presumably flushes `state' (confirm).
+     */
+    ret = charset_from_unicode(NULL, 0, outbuf, size, charset, &state, NULL);
+    size -= ret;
+    outbuf += ret;
+    *outbuf = '\0';
+    return outbuf;		       /* points at the NUL just written */
+}
+
+char *ustrtoa(wchar_t const *s, char *outbuf, int size, int charset) {
+    return ustrtoa_internal(s, outbuf, size, charset, FALSE); /* careful off: no error pointer passed */
+}
+
+char *ustrtoa_careful(wchar_t const *s, char *outbuf, int size, int charset) {
+    return ustrtoa_internal(s, outbuf, size, charset, TRUE); /* careful on: NULL on conversion error */
+}
+
+wchar_t *ustrfroma(char const *s, wchar_t *outbuf, int size, int charset) {
+    charset_state cs = CHARSET_INIT_STATE;
+    int remaining, produced;
+
+    /* Always leave a terminated string behind, even for NULL input. */
+    *outbuf = L'\0';
+    if (s == NULL)
+	return outbuf;
+
+    remaining = strlen(s);
+    size -= 1;			       /* reserve one element for the NUL */
+    while (remaining > 0) {
+	produced = charset_to_unicode(&s, &remaining, outbuf, size,
+				      charset, &cs, NULL, 0);
+	if (produced == 0)
+	    break;		       /* no output this round: stop here */
+	outbuf += produced;
+	size -= produced;
+	*outbuf = L'\0';
+    }
+    /* The result is the advanced pointer, i.e. where the NUL was stored. */
     return outbuf;
 }
 
-int ustrlen(wchar_t *s) {
+char *utoa_internal_dup(wchar_t const *s, int charset, int *lenp, int careful)
+{
+    /* Convert s into a newly allocated string in `charset'; *lenp (if
+     * lenp is non-NULL) receives the output length. With `careful'
+     * set, a conversion error frees the buffer and returns NULL. */
+    char *outbuf;
+    int outpos, outlen, len, ret, err;
+    charset_state state = CHARSET_INIT_STATE;
+
+    if (!s) {
+	if (lenp)
+	    *lenp = 0;		       /* keep *lenp defined for NULL input */
+	return dupstr("");
+    }
+
+    len = ustrlen(s);
+    outlen = len + 10;
+    outbuf = snewn(outlen, char);
+    outpos = 0;
+    outbuf[outpos] = '\0';
+    while (len > 0) {
+	err = 0;
+	ret = charset_from_unicode(&s, &len,
+				   outbuf + outpos, outlen - outpos - 1,
+				   charset, &state, (careful ? &err : NULL));
+	if (err) {
+	    sfree(outbuf);
+	    return NULL;
+	}
+	if (!ret) {		       /* nothing produced: grow and retry */
+	    outlen = outlen * 3 / 2;
+	    outbuf = sresize(outbuf, outlen, char);
+	}
+	outpos += ret;
+	outbuf[outpos] = '\0';
+    }
+    /* Final no-input call; keep one of the 32 spare bytes for the NUL. */
+    outlen = outpos + 32;
+    outbuf = sresize(outbuf, outlen, char);
+    ret = charset_from_unicode(NULL, 0,
+			       outbuf + outpos, outlen - outpos - 1,
+			       charset, &state, NULL);
+    outpos += ret;
+    outbuf[outpos] = '\0';
+    if (lenp)
+	*lenp = outpos;
+    return outbuf;
+}
+
+char *utoa_dup(wchar_t const *s, int charset)
+{   /* Allocating conversion: length not reported, careful mode off. */
+    return utoa_internal_dup(s, charset, NULL, FALSE);
+}
+
+char *utoa_dup_len(wchar_t const *s, int charset, int *len)
+{   /* As utoa_dup, but the output length is also stored through len. */
+    return utoa_internal_dup(s, charset, len, FALSE);
+}
+
+char *utoa_careful_dup(wchar_t const *s, int charset)
+{   /* Careful mode on: returns NULL if the conversion reports an error. */
+    return utoa_internal_dup(s, charset, NULL, TRUE);
+}
+
+wchar_t *ufroma_dup(char const *s, int charset) {
+    /* Convert s from `charset' into a right-sized wide-string buffer. */
+    int len, cap;		       /* cap: size used for the current attempt */
+    wchar_t *buf = NULL;
+
+    len = (s ? strlen(s) : 0) + 2;     /* NUL plus one element of slack */
+    do {
+	cap = len;
+	buf = sresize(buf, cap, wchar_t);
+	ustrfroma(s, buf, cap, charset);
+	len = (3 * len) / 2 + 1;       /* this guarantees a strict increase */
+    } while (ustrlen(buf) >= cap-1);   /* filled up: may be truncated, retry */
+    return sresize(buf, ustrlen(buf)+1, wchar_t);
+}
+
+char *utoa_locale_dup(wchar_t const *s)
+{
+    /*
+     * Convert s with the C library locale (wcstombs) into a new buffer.
+     */
+    char *ret;
+    int len, outlen;
+    size_t siz;
+
+    len = ustrlen(s);
+    outlen = 1 + MB_CUR_MAX * len;
+    ret = snewn(outlen+1, char);
+
+    siz = wcstombs(ret, s, outlen);
+    /* wcstombs returns (size_t)-1 for an unconvertible character. */
+    if (siz != (size_t)-1 && siz) {
+	assert(siz <= (size_t)(outlen));
+	ret[siz] = '\0';
+	ret = sresize(ret, siz+1, char);
+	return ret;
+    }
+    sfree(ret);			       /* the fallback allocates its own buffer */
+
+    /*
+     * If that failed, try a different strategy (which we will also
+     * attempt in the total absence of wcstombs). Retrieve the
+     * locale's charset from nl_langinfo or equivalent, and use
+     * normal utoa_dup.
+     */
+    return utoa_dup(s, charset_from_locale());
+}
+
+wchar_t *ufroma_locale_dup(char const *s)
+{
+    /*
+     * Convert s with the C library locale (mbstowcs) into a new buffer.
+     */
+    wchar_t *ret;
+    int len, outlen;
+    size_t siz;
+
+    len = strlen(s);
+    outlen = 1 + 2*len;
+    ret = snewn(outlen+1, wchar_t);  /* be conservative */
+
+    siz = mbstowcs(ret, s, outlen);
+    /* mbstowcs returns (size_t)-1 for an invalid multibyte sequence. */
+    if (siz != (size_t)-1 && siz) {
+	assert(siz <= (size_t)(outlen));
+	ret[siz] = L'\0';
+	ret = sresize(ret, siz+1, wchar_t);
+	return ret;
+    }
+    sfree(ret);			       /* the fallback allocates its own buffer */
+
+    /*
+     * If that failed, try a different strategy (which we will also
+     * attempt in the total absence of mbstowcs). Retrieve the
+     * locale's charset from nl_langinfo or equivalent, and use
+     * normal ufroma_dup.
+     */
+    return ufroma_dup(s, charset_from_locale());
+}
+
+int ustrlen(wchar_t const *s) {
     int len = 0;
     while (*s++) len++;
     return len;
@@ -43,7 +240,7 @@ wchar_t *uadv(wchar_t *s) {
     return s + 1 + ustrlen(s);
 }
 
-wchar_t *ustrcpy(wchar_t *dest, wchar_t *source) {
+wchar_t *ustrcpy(wchar_t *dest, wchar_t const *source) {
     wchar_t *ret = dest;
     do {
 	*dest++ = *source;
@@ -51,6 +248,15 @@ wchar_t *ustrcpy(wchar_t *dest, wchar_t *source) {
     return ret;
 }
 
+wchar_t *ustrncpy(wchar_t *dest, wchar_t const *source, int n) {
+    wchar_t *out = dest;	       /* NUL-pads once source runs out */
+    for (;; n--) {		       /* body runs n+1 times for n >= 0 */
+	if ((*out++ = *source) != L'\0') source++;
+	if (n <= 0) break;
+    }
+    return dest;
+}
+
 int ustrcmp(wchar_t *lhs, wchar_t *rhs) {
     if (!lhs && !rhs) return 0;
     if (!lhs) return -1;
@@ -67,13 +273,24 @@ int ustrcmp(wchar_t *lhs, wchar_t *rhs) {
 wchar_t utolower(wchar_t c) {
     if (c == L'\0')
 	return c;		       /* this property needed by ustricmp */
-    /* FIXME: this doesn't even come close */
+#ifdef HAS_TOWLOWER
+    return towlower(c);		       /* defer to the C library when available */
+#else /* fallback: folds only 'A'..'Z' */
     if (c >= 'A' && c <= 'Z')
 	c += 'a'-'A';
     return c;
+#endif /* HAS_TOWLOWER */
 }
 
-int ustricmp(wchar_t *lhs, wchar_t *rhs) {
+int uisalpha(wchar_t c) {		/* nonzero iff c counts as alphabetic */
+#ifndef HAS_ISWALPHA
+    return ('A' <= c && c <= 'Z') || ('a' <= c && c <= 'z'); /* A-Z, a-z only */
+#else
+    return iswalpha(c);
+#endif
+}
+
+int ustricmp(wchar_t const *lhs, wchar_t const *rhs) {
     wchar_t lc, rc;
     while ((lc = utolower(*lhs)) == (rc = utolower(*rhs)) && lc && rc)
 	lhs++, rhs++;
@@ -85,6 +302,19 @@ int ustricmp(wchar_t *lhs, wchar_t *rhs) {
 	return 1;
 }
 
+int ustrnicmp(wchar_t const *lhs, wchar_t const *rhs, int maxlen) {
+    /* Compare at most maxlen characters after folding each one with
+     * utolower; the result is -1, 0 or +1. */
+    wchar_t a = 0, b = 0;	       /* equal if nothing gets compared */
+    for (; maxlen > 0; maxlen--, lhs++, rhs++) {
+	a = utolower(*lhs);
+	b = utolower(*rhs);
+	if (a != b || a == L'\0')
+	    break;
+    }
+    return (a > b) - (a < b);
+}
+
 wchar_t *ustrlow(wchar_t *s) {
     wchar_t *p = s;
     while (*p) {
@@ -94,7 +324,7 @@ wchar_t *ustrlow(wchar_t *s) {
     return s;
 }
 
-int utoi(wchar_t *s) {
+int utoi(wchar_t const *s) {
     int sign = +1;
     int n;
 
@@ -110,10 +340,18 @@ int utoi(wchar_t *s) {
 	s++;
     }
 
-    return n;
+    return n * sign;
 }
 
-int utob(wchar_t *s) {
+double utof(wchar_t const *s)
+{   /* Parse a number from s via a temporary CS_ASCII copy. */
+    char *cs = utoa_dup(s, CS_ASCII);
+    double ret = strtod(cs, NULL);     /* unlike atof, defined on overflow */
+    sfree(cs);
+    return ret;
+}
+
+int utob(wchar_t const *s) {
     if (!ustricmp(s, L"yes") || !ustricmp(s, L"y") ||
 	!ustricmp(s, L"true") || !ustricmp(s, L"t"))
 	return TRUE;
@@ -125,13 +363,9 @@ int uisdigit(wchar_t c) {
 }
 
 #define USTRFTIME_DELTA 128
-wchar_t *ustrftime(wchar_t *wfmt, struct tm *timespec) {
-    void *blk = NULL;
-    wchar_t *wblk, *wp;
-    char *fmt, *text, *p;
-    size_t size = 0;
-    size_t len;
-
+static void ustrftime_internal(rdstring *rs, char formatchr,
+			       const struct tm *timespec)
+{
     /*
      * strftime has the entertaining property that it returns 0
      * _either_ on out-of-space _or_ on successful generation of
@@ -139,36 +373,116 @@ wchar_t *ustrftime(wchar_t *wfmt, struct tm *timespec) {
      * generate the empty string. Somebody throw a custard pie at
      * whoever was responsible for that. Please?
      */
-    if (wfmt) {
-	len = ustrlen(wfmt);
-	fmt = mknewa(char, 2+len);
-	ustrtoa(wfmt, fmt+1, len+1);
-	fmt[0] = ' ';
-    } else
-	fmt = " %c";
-
-    while (1) {
+
+#ifdef HAS_WCSFTIME
+    wchar_t *buf = NULL;
+    wchar_t fmt[4];
+    int size, ret;
+
+    fmt[0] = L' ';
+    fmt[1] = L'%';
+    /* Format chars are all ASCII, so conversion to Unicode is no problem */
+    fmt[2] = formatchr;
+    fmt[3] = L'\0';
+
+    size = 0;
+    do {
+	size += USTRFTIME_DELTA;
+	buf = sresize(buf, size, wchar_t);
+	ret = (int) wcsftime(buf, size, fmt, timespec);
+    } while (ret == 0);
+
+    rdadds(rs, buf+1);
+    sfree(buf);
+#else
+    char *buf = NULL;
+    wchar_t *cvtbuf;
+    char fmt[4];
+    int size, ret;
+
+    fmt[0] = ' ';
+    fmt[1] = '%';
+    fmt[2] = formatchr;
+    fmt[3] = '\0';
+
+    size = 0;
+    do {
 	size += USTRFTIME_DELTA;
-	blk = resize((char *)blk, size);
-	len = strftime((char *)blk, size-1, fmt, timespec);
-	if (len > 0)
-	    break;
+	buf = sresize(buf, size, char);
+	ret = (int) strftime(buf, size, fmt, timespec);
+    } while (ret == 0);
+
+    cvtbuf = ufroma_locale_dup(buf+1);
+    rdadds(rs, cvtbuf);
+    sfree(cvtbuf);
+    sfree(buf);
+#endif
+}
+
+wchar_t *ustrftime(const wchar_t *wfmt, const struct tm *timespec)
+{
+    rdstring rs = { 0, 0, NULL };
+
+    if (!wfmt)
+	wfmt = L"%c";		       /* NULL format defaults to "%c" */
+    /* Each pass expands one directive or copies one literal character. */
+    while (*wfmt) {
+	if (wfmt[0] == L'%' && wfmt[1] == L'%') {   /* "%%": literal percent */
+	    rdadd(&rs, L'%');
+	    wfmt += 2;
+	} else if (wfmt[0] == L'%' && wfmt[1]) {    /* "%X": one conversion */
+	    ustrftime_internal(&rs, wfmt[1], timespec);
+	    wfmt += 2;
+	} else {		       /* plain char, or a lone trailing '%' */
+	    rdadd(&rs, wfmt[0]);
+	    wfmt++;
+	}
     }
 
-    /* Note: +1 for the terminating 0, -1 for the initial space in fmt */
-    wblk = resize((wchar_t *)blk, len);
-    text = mknewa(char, len);
-    strftime(text, len, fmt+1, timespec);
-    /*
-     * We operate in the C locale, so this all ought to be kosher
-     * ASCII. If we ever move outside ASCII machines, we may need
-     * to make this more portable...
-     */
-    for (wp = wblk, p = text; *p; p++, wp++)
-	*wp = *p;
-    *wp = 0;
-    if (wfmt)
-	sfree(fmt);
-    sfree(text);
-    return wblk;
+    return rdtrim(&rs);
+}
+
+/*
+ * Determine whether a Unicode string can be translated into a
+ * given charset without any missing characters.
+ */
+int cvt_ok(int charset, const wchar_t *s)
+{
+    /* The converted bytes are discarded; only the error flag matters. */
+    char scratch[256];
+    charset_state cs = CHARSET_INIT_STATE;
+    int remaining, failed = 0;
+
+    for (remaining = ustrlen(s); remaining > 0 && !failed; ) {
+	/* Translate the next chunk of s into the scratch buffer. */
+	(void)charset_from_unicode(&s, &remaining, scratch, lenof(scratch),
+				   charset, &cs, &failed);
+    }
+
+    return failed ? FALSE : TRUE;
+}
+
+/*
+ * Wrapper around charset_from_localenc which accepts the charset
+ * name as a wide string (since that happens to be more useful).
+ * Also throws a Halibut error and falls back to CS_ASCII if the
+ * charset is unrecognised, meaning the rest of the program can
+ * rely on always getting a valid charset id back from this
+ * function.
+ */
+int charset_from_ustr(filepos *fpos, const wchar_t *name)
+{
+    char *narrow;
+    int id;
+
+    /* The lookup is given a char string, so use a temporary ASCII copy. */
+    narrow = utoa_dup(name, CS_ASCII);
+    id = charset_from_localenc(narrow);
+    sfree(narrow);
+
+    if (id != CS_NONE)
+	return id;
+    /* Unrecognised name: report it, then fall back to CS_ASCII. */
+    error(err_charset, fpos, name);
+    return CS_ASCII;
 }