[sgt/halibut] / ustring.c

/*
 * ustring.c: Unicode string routines
 */

#include <wchar.h>
#include <time.h>
#include "halibut.h"

/*
 * Duplicate the wide string `s' into storage obtained from
 * mknewa()/mknew(). A NULL `s' yields a newly obtained empty string
 * (a single zero element) instead of NULL.
 */
wchar_t *ustrdup(wchar_t const *s) {
    wchar_t *copy;

    if (!s) {
        /* No input: hand back a one-element buffer holding just the NUL. */
        copy = mknew(wchar_t);
        *copy = 0;
        return copy;
    }

    copy = mknewa(wchar_t, 1+ustrlen(s));
    ustrcpy(copy, s);
    return copy;
}

/*
 * Convert the wide string `s' to `charset', writing into `outbuf',
 * which has room for `size' chars (one of which is kept back for the
 * terminating NUL).
 *
 * Return value: a pointer to the NUL that terminates the output, i.e.
 * the END of what was written, not the start of `outbuf'. If
 * charset_from_unicode() produces no output on some pass, conversion
 * stops there and the current end pointer is returned. If `careful'
 * is nonzero and charset_from_unicode() sets its error flag, NULL is
 * returned instead.
 *
 * A NULL `s' produces an empty string.
 */
static char *ustrtoa_internal(wchar_t const *s, char *outbuf, int size,
                              int charset, int careful) {
    int remaining, written, err;
    charset_state state = CHARSET_INIT_STATE;

    if (s == NULL) {
        outbuf[0] = '\0';
        return outbuf;
    }

    remaining = ustrlen(s);
    size -= 1;                         /* keep one char for the NUL */
    outbuf[0] = '\0';

    while (remaining > 0) {
        err = 0;
        written = charset_from_unicode(&s, &remaining, outbuf, size,
                                       charset, &state,
                                       (careful ? &err : NULL));
        if (err != 0)
            return NULL;
        if (written == 0)
            return outbuf;
        outbuf += written;
        size -= written;
        outbuf[0] = '\0';              /* keep the output terminated */
    }

    /*
     * Final call with no input, so the converter can emit whatever
     * its state still requires.
     */
    written = charset_from_unicode(NULL, 0, outbuf, size, charset,
                                   &state, NULL);
    outbuf += written;
    outbuf[0] = '\0';
    return outbuf;
}

/*
 * Convert `s' to `charset' in `outbuf' (capacity `size') via
 * ustrtoa_internal() with the `careful' flag off.
 */
char *ustrtoa(wchar_t const *s, char *outbuf, int size, int charset) {
    char *end = ustrtoa_internal(s, outbuf, size, charset, FALSE);
    return end;
}

/*
 * Convert `s' to `charset' in `outbuf' (capacity `size') via
 * ustrtoa_internal() with the `careful' flag on, so NULL comes back
 * if the converter reports an error.
 */
char *ustrtoa_careful(wchar_t const *s, char *outbuf, int size, int charset) {
    char *end = ustrtoa_internal(s, outbuf, size, charset, TRUE);
    return end;
}

/*
 * Convert the `charset'-encoded string `s' to wide characters in
 * `outbuf', which has room for `size' elements (one of which is kept
 * back for the terminating NUL).
 *
 * Returns a pointer to the terminating NUL written, i.e. the END of
 * the output. Conversion stops early if charset_to_unicode() produces
 * no output on some pass. A NULL `s' produces an empty string.
 */
wchar_t *ustrfroma(char const *s, wchar_t *outbuf, int size, int charset) {
    int remaining, produced;
    charset_state state = CHARSET_INIT_STATE;

    if (s == NULL) {
        outbuf[0] = L'\0';
        return outbuf;
    }

    remaining = strlen(s);
    size -= 1;                         /* keep one element for the NUL */
    outbuf[0] = L'\0';

    while (remaining > 0) {
        produced = charset_to_unicode(&s, &remaining, outbuf, size,
                                      charset, &state, NULL, 0);
        if (produced == 0)
            break;                     /* nothing more could be written */
        outbuf += produced;
        size -= produced;
        outbuf[0] = L'\0';
    }

    return outbuf;
}

/*
 * Convert the wide string `s' to `charset', returning the result in a
 * dynamically sized buffer that grows as needed.
 *
 *  - `lenp', if non-NULL, receives the number of chars in the result
 *    (excluding the terminating NUL). It is set to 0 when `s' is NULL
 *    and left untouched when NULL is returned.
 *  - `careful', if nonzero, makes the function free its buffer and
 *    return NULL when charset_from_unicode() sets its error flag.
 *
 * A NULL `s' yields dupstr("").
 */
char *utoa_internal_dup(wchar_t const *s, int charset, int *lenp, int careful)
{
    char *outbuf;
    int outpos, outlen, len, ret, err;
    charset_state state = CHARSET_INIT_STATE;

    if (!s) {
        /* Report a length for the empty result too. */
        if (lenp)
            *lenp = 0;
        return dupstr("");
    }

    len = ustrlen(s);

    outlen = len + 10;
    outbuf = mknewa(char, outlen);

    outpos = 0;
    outbuf[outpos] = '\0';

    while (len > 0) {
        err = 0;
        /* Always leave one char spare for the terminating NUL. */
        ret = charset_from_unicode(&s, &len,
                                   outbuf + outpos, outlen - outpos - 1,
                                   charset, &state, (careful ? &err : NULL));
        if (err) {
            sfree(outbuf);
            return NULL;
        }
        if (!ret) {
            /* No output this pass: enlarge the buffer and try again. */
            outlen = outlen * 3 / 2;
            outbuf = resize(outbuf, outlen);
        }
        outpos += ret;
        outbuf[outpos] = '\0';
    }

    /*
     * Clean up: give the converter 32 chars of room for any trailing
     * output. The space offered must be `outlen - outpos - 1' so that
     * the NUL written below still lies inside the allocation; offering
     * more than that would let the converter run off the end.
     */
    outlen = outpos + 32;
    outbuf = resize(outbuf, outlen);
    ret = charset_from_unicode(NULL, 0,
                               outbuf + outpos, outlen - outpos - 1,
                               charset, &state, NULL);
    outpos += ret;
    outbuf[outpos] = '\0';
    if (lenp)
        *lenp = outpos;
    return outbuf;
}

/*
 * Convert `s' to `charset' in a new buffer via utoa_internal_dup(),
 * without reporting the length and with the `careful' flag off.
 */
char *utoa_dup(wchar_t const *s, int charset)
{
    char *converted = utoa_internal_dup(s, charset, NULL, FALSE);
    return converted;
}

/*
 * As utoa_dup(), but `len' is passed through to utoa_internal_dup()
 * to receive the length of the converted string.
 */
char *utoa_dup_len(wchar_t const *s, int charset, int *len)
{
    char *converted = utoa_internal_dup(s, charset, len, FALSE);
    return converted;
}

/*
 * As utoa_dup(), but with the `careful' flag on, so NULL comes back
 * if the converter reports an error.
 */
char *utoa_careful_dup(wchar_t const *s, int charset)
{
    char *converted = utoa_internal_dup(s, charset, NULL, TRUE);
    return converted;
}

wchar_t *ufroma_dup(char const *s, int charset) {
    int len;
    wchar_t *buf = NULL;

    len = strlen(s) + 1;
    do {
	buf = resize(buf, len);
	ustrfroma(s, buf, len, charset);
	len = (3 * len) / 2 + 1;       /* this guarantees a strict increase */
    } while (ustrlen(buf) >= len-1);

    buf = resize(buf, ustrlen(buf)+1);
    return buf;
}

/*
 * Count the wide characters in `s' up to, but not including, the
 * terminating zero element. `s' must not be NULL.
 */
int ustrlen(wchar_t const *s) {
    int count;

    for (count = 0; s[count] != L'\0'; count++)
        continue;
    return count;
}

/*
 * Step past the wide string starting at `s': the result points at
 * the element immediately after its terminating zero.
 */
wchar_t *uadv(wchar_t *s) {
    wchar_t *after_nul = s + 1;        /* allow for the terminator... */
    after_nul += ustrlen(s);           /* ...and for the characters */
    return after_nul;
}

/*
 * Copy the wide string `source', including its terminating zero,
 * into `dest'. Returns `dest'. No bounds checking is performed.
 */
wchar_t *ustrcpy(wchar_t *dest, wchar_t const *source) {
    wchar_t *out = dest;

    for (;;) {
        wchar_t ch = *source++;
        *out++ = ch;
        if (ch == L'\0')
            break;                     /* terminator has been copied */
    }
    return dest;
}

/*
 * Compare two wide strings element by element.
 *
 * Returns -1, 0 or +1 according to whether `lhs' sorts before, equal
 * to or after `rhs'. NULL is accepted for either argument: two NULLs
 * compare equal, and a NULL sorts before any non-NULL string.
 */
int ustrcmp(wchar_t *lhs, wchar_t *rhs) {
    if (!lhs)
        return rhs ? -1 : 0;
    if (!rhs)
        return +1;

    /*
     * Skip the common prefix. If *lhs is nonzero and equals *rhs,
     * then *rhs is nonzero too, so it needs no separate test.
     */
    for (; *lhs && *lhs == *rhs; lhs++, rhs++)
        continue;

    if (*lhs == *rhs)
        return 0;
    return (*lhs < *rhs) ? -1 : 1;
}

/*
 * Map the letters 'A'..'Z' to 'a'..'z'; every other value, notably
 * L'\0', is returned unchanged (ustricmp relies on zero mapping to
 * zero).
 *
 * FIXME: this doesn't even come close to real Unicode case folding.
 */
wchar_t utolower(wchar_t c) {
    if (c != L'\0' && c >= 'A' && c <= 'Z')
        return c + ('a'-'A');
    return c;
}

/*
 * Return 1 if `c' is one of the letters 'A'..'Z' or 'a'..'z', and 0
 * otherwise.
 *
 * FIXME: this doesn't even come close to a real Unicode test.
 */
int uisalpha(wchar_t c) {
    if (c >= 'A' && c <= 'Z')
        return 1;
    if (c >= 'a' && c <= 'z')
        return 1;
    return 0;
}

/*
 * Compare two wide strings after passing each element through
 * utolower(). Returns -1, 0 or +1 according to whether `lhs' sorts
 * before, equal to or after `rhs'. Neither argument may be NULL.
 */
int ustricmp(wchar_t *lhs, wchar_t *rhs) {
    for (;;) {
        wchar_t lc = utolower(*lhs);
        wchar_t rc = utolower(*rhs);

        if (lc != rc)
            return (lc < rc) ? -1 : 1;
        if (!lc)
            return 0;                  /* both strings ended together */

        lhs++;
        rhs++;
    }
}

/*
 * Lower-case the wide string `s' in place by applying utolower() to
 * each element. Returns `s'.
 */
wchar_t *ustrlow(wchar_t *s) {
    wchar_t *cursor;

    for (cursor = s; *cursor; cursor++)
        *cursor = utolower(*cursor);
    return s;
}

/*
 * Parse a decimal integer from the start of the wide string `s'.
 *
 * An optional leading '-' is accepted, followed by digits '0'..'9'.
 * Parsing stops at the first non-digit; if there are no digits the
 * result is 0. No overflow checking is performed.
 *
 * Returns the parsed value, negated if a leading '-' was present.
 */
int utoi(wchar_t *s) {
    int sign = +1;
    int n = 0;

    if (*s == L'-') {
        s++;
        sign = -1;
    }

    while (*s >= L'0' && *s <= L'9') {
        n = n * 10 + (int)(*s - L'0');
        s++;
    }

    /* Apply the sign recorded above, so "-42" yields -42 and not 42. */
    return sign * n;
}

/*
 * Interpret the wide string `s' as a boolean: TRUE if it matches
 * "yes", "y", "true" or "t" under ustricmp(), FALSE for anything
 * else.
 */
int utob(wchar_t *s) {
    wchar_t *truthy[] = { L"yes", L"y", L"true", L"t" };
    int i;

    for (i = 0; i < (int)(sizeof(truthy) / sizeof(truthy[0])); i++) {
        if (ustricmp(s, truthy[i]) == 0)
            return TRUE;
    }
    return FALSE;
}

/*
 * Return 1 if `c' is one of the digits '0'..'9', and 0 otherwise.
 */
int uisdigit(wchar_t c) {
    if (c < L'0' || c > L'9')
        return 0;
    return 1;
}

#define USTRFTIME_DELTA 128
/*
 * Format `timespec' according to the wide format string `wfmt' and
 * return the result as a wide string in a buffer obtained through
 * resize(). A NULL `wfmt' selects the format "%c".
 *
 * FIXME: really we ought to copy non-% parts of the format
 * ourselves, and only resort to strftime for % parts. Also we
 * should use wcsftime if it's present.
 */
wchar_t *ustrftime(wchar_t *wfmt, struct tm *timespec) {
    void *blk = NULL;
    wchar_t *wblk, *wp;
    char *fmt, *text, *p;
    size_t size = 0;
    size_t len;

    /*
     * strftime returns 0 both when it runs out of space and when it
     * successfully generates the empty string. To tell the two apart,
     * the format used for sizing always begins with a space, so that
     * its output can never be empty.
     */
    if (!wfmt) {
        fmt = " %c";
    } else {
        len = ustrlen(wfmt);
        fmt = mknewa(char, 2+len);
        fmt[0] = ' ';
        ustrtoa(wfmt, fmt+1, len+1, CS_ASCII);   /* CS_FIXME? */
    }

    /* Grow the scratch buffer in fixed steps until strftime succeeds. */
    do {
        size += USTRFTIME_DELTA;
        blk = resize((char *)blk, size);
        len = strftime((char *)blk, size-1, fmt, timespec);
    } while (len == 0);

    /*
     * `len' counts the leading space but not the terminating zero,
     * which makes it exactly the number of elements the real output
     * needs: +1 for the terminator, -1 for the space.
     */
    wblk = resize((wchar_t *)blk, len);
    text = mknewa(char, len);
    strftime(text, len, fmt+1, timespec);

    /*
     * We operate in the C locale, so this all ought to be kosher
     * ASCII and each char can simply be widened. If we ever move
     * outside ASCII machines, this may need to be made more portable.
     */
    wp = wblk;
    for (p = text; *p != '\0'; p++)
        *wp++ = *p;
    *wp = 0;

    sfree(text);
    if (wfmt)
        sfree(fmt);                    /* only allocated in this case */
    return wblk;
}
Commit	Line	Data
d7482997	1	/*
	2	* ustring.c: Unicode string routines
	3	*/
	4
	5	#include <wchar.h>
	6	#include <time.h>
	7	#include "halibut.h"
	8
e4ea58f8	9	wchar_t ustrdup(wchar_t const s) {
d7482997	10	wchar_t *r;
	11	if (s) {
	12	r = mknewa(wchar_t, 1+ustrlen(s));
	13	ustrcpy(r, s);
	14	} else {
	15	r = mknew(wchar_t);
	16	*r = 0;
	17	}
	18	return r;
	19	}
	20
e4ea58f8	21	static char ustrtoa_internal(wchar_t const s, char *outbuf, int size,
	22	int charset, int careful) {
	23	int len, ret, err;
	24	charset_state state = CHARSET_INIT_STATE;
	25
d7482997	26	if (!s) {
	27	*outbuf = '\0';
	28	return outbuf;
	29	}
e4ea58f8	30
	31	len = ustrlen(s);
	32	size--; /* leave room for terminating NUL */
	33	*outbuf = '\0';
	34	while (len > 0) {
	35	err = 0;
	36	ret = charset_from_unicode(&s, &len, outbuf, size, charset, &state,
	37	(careful ? &err : NULL));
	38	if (err)
	39	return NULL;
	40	if (!ret)
	41	return outbuf;
	42	size -= ret;
	43	outbuf += ret;
	44	*outbuf = '\0';
	45	}
	46	/*
	47	* Clean up
	48	*/
	49	ret = charset_from_unicode(NULL, 0, outbuf, size, charset, &state, NULL);
	50	size -= ret;
	51	outbuf += ret;
	52	*outbuf = '\0';
d7482997	53	return outbuf;
	54	}
	55
e4ea58f8	56	char ustrtoa(wchar_t const s, char *outbuf, int size, int charset) {
	57	return ustrtoa_internal(s, outbuf, size, charset, FALSE);
	58	}
	59
	60	char ustrtoa_careful(wchar_t const s, char *outbuf, int size, int charset) {
	61	return ustrtoa_internal(s, outbuf, size, charset, TRUE);
	62	}
	63
	64	wchar_t ustrfroma(char const s, wchar_t *outbuf, int size, int charset) {
	65	int len, ret;
	66	charset_state state = CHARSET_INIT_STATE;
	67
ba9c1487	68	if (!s) {
	69	*outbuf = L'\0';
	70	return outbuf;
	71	}
e4ea58f8	72
	73	len = strlen(s);
	74	size--; /* allow for terminating NUL */
	75	*outbuf = L'\0';
	76	while (len > 0) {
	77	ret = charset_to_unicode(&s, &len, outbuf, size,
	78	charset, &state, NULL, 0);
	79	if (!ret)
	80	return outbuf;
	81	outbuf += ret;
	82	size -= ret;
	83	*outbuf = L'\0';
	84	}
ba9c1487	85	return outbuf;
	86	}
	87
e4ea58f8	88	char utoa_internal_dup(wchar_t const s, int charset, int *lenp, int careful)
	89	{
	90	char *outbuf;
	91	int outpos, outlen, len, ret, err;
	92	charset_state state = CHARSET_INIT_STATE;
50d6b4bd	93
e4ea58f8	94	if (!s) {
	95	return dupstr("");
	96	}
50d6b4bd	97
e4ea58f8	98	len = ustrlen(s);
	99
	100	outlen = len + 10;
	101	outbuf = mknewa(char, outlen);
	102
	103	outpos = 0;
	104	outbuf[outpos] = '\0';
	105
	106	while (len > 0) {
	107	err = 0;
	108	ret = charset_from_unicode(&s, &len,
	109	outbuf + outpos, outlen - outpos - 1,
	110	charset, &state, (careful ? &err : NULL));
	111	if (err) {
	112	sfree(outbuf);
	113	return NULL;
	114	}
	115	if (!ret) {
	116	outlen = outlen * 3 / 2;
	117	outbuf = resize(outbuf, outlen);
	118	}
	119	outpos += ret;
	120	outbuf[outpos] = '\0';
	121	}
	122	/*
	123	* Clean up
	124	*/
	125	outlen = outpos + 32;
	126	outbuf = resize(outbuf, outlen);
	127	ret = charset_from_unicode(NULL, 0,
	128	outbuf + outpos, outlen - outpos + 1,
	129	charset, &state, NULL);
	130	outpos += ret;
	131	outbuf[outpos] = '\0';
	132	if (lenp)
	133	*lenp = outpos;
	134	return outbuf;
50d6b4bd	135	}
50d6b4bd	136
e4ea58f8	137	char utoa_dup(wchar_t const s, int charset)
	138	{
	139	return utoa_internal_dup(s, charset, NULL, FALSE);
	140	}
	141
	142	char utoa_dup_len(wchar_t const s, int charset, int *len)
	143	{
	144	return utoa_internal_dup(s, charset, len, FALSE);
	145	}
	146
	147	char utoa_careful_dup(wchar_t const s, int charset)
	148	{
	149	return utoa_internal_dup(s, charset, NULL, TRUE);
	150	}
	151
	152	wchar_t ufroma_dup(char const s, int charset) {
ba9c1487	153	int len;
	154	wchar_t *buf = NULL;
	155
	156	len = strlen(s) + 1;
	157	do {
	158	buf = resize(buf, len);
e4ea58f8	159	ustrfroma(s, buf, len, charset);
ba9c1487	160	len = (3 * len) / 2 + 1; /* this guarantees a strict increase */
	161	} while (ustrlen(buf) >= len-1);
	162
	163	buf = resize(buf, ustrlen(buf)+1);
	164	return buf;
	165	}
	166
5dd44dce	167	int ustrlen(wchar_t const *s) {
d7482997	168	int len = 0;
	169	while (*s++) len++;
	170	return len;
	171	}
	172
	173	wchar_t uadv(wchar_t s) {
	174	return s + 1 + ustrlen(s);
	175	}
	176
5dd44dce	177	wchar_t ustrcpy(wchar_t dest, wchar_t const *source) {
d7482997	178	wchar_t *ret = dest;
	179	do {
	180	dest++ = source;
	181	} while (*source++);
	182	return ret;
	183	}
	184
	185	int ustrcmp(wchar_t lhs, wchar_t rhs) {
	186	if (!lhs && !rhs) return 0;
	187	if (!lhs) return -1;
	188	if (!rhs) return +1;
	189	while (lhs && rhs && lhs==rhs)
	190	lhs++, rhs++;
	191	if (lhs < rhs)
	192	return -1;
	193	else if (lhs > rhs)
	194	return 1;
	195	return 0;
	196	}
	197
	198	wchar_t utolower(wchar_t c) {
	199	if (c == L'\0')
	200	return c; /* this property needed by ustricmp */
	201	/* FIXME: this doesn't even come close */
	202	if (c >= 'A' && c <= 'Z')
	203	c += 'a'-'A';
	204	return c;
	205	}
	206
831da32e	207	int uisalpha(wchar_t c) {
	208	/* FIXME: this doesn't even come close */
	209	return (c >= 'A' && c <= 'Z') \|\| (c >= 'a' && c <= 'z');
	210	}
	211
d7482997	212	int ustricmp(wchar_t lhs, wchar_t rhs) {
	213	wchar_t lc, rc;
	214	while ((lc = utolower(lhs)) == (rc = utolower(rhs)) && lc && rc)
	215	lhs++, rhs++;
	216	if (!lc && !rc)
	217	return 0;
	218	if (lc < rc)
	219	return -1;
	220	else
	221	return 1;
	222	}
	223
	224	wchar_t ustrlow(wchar_t s) {
	225	wchar_t *p = s;
	226	while (*p) {
	227	p = utolower(p);
	228	p++;
	229	}
	230	return s;
	231	}
	232
	233	int utoi(wchar_t *s) {
	234	int sign = +1;
	235	int n;
	236
	237	if (*s == L'-') {
	238	s++;
	239	sign = -1;
	240	}
	241
	242	n = 0;
	243	while (s && s >= L'0' && *s <= L'9') {
	244	n *= 10;
	245	n += (*s - '0');
	246	s++;
	247	}
	248
	249	return n;
	250	}
	251
	252	int utob(wchar_t *s) {
	253	if (!ustricmp(s, L"yes") \|\| !ustricmp(s, L"y") \|\|
	254	!ustricmp(s, L"true") \|\| !ustricmp(s, L"t"))
	255	return TRUE;
	256	return FALSE;
	257	}
	258
	259	int uisdigit(wchar_t c) {
	260	return c >= L'0' && c <= L'9';
	261	}
	262
	263	#define USTRFTIME_DELTA 128
	264	wchar_t ustrftime(wchar_t wfmt, struct tm *timespec) {
	265	void *blk = NULL;
	266	wchar_t wblk, wp;
	267	char fmt, text, *p;
	268	size_t size = 0;
	269	size_t len;
	270
	271	/*
e4ea58f8	272	* FIXME: really we ought to copy non-% parts of the format
	273	* ourselves, and only resort to strftime for % parts. Also we
	274	* should use wcsftime if it's present.
	275	*/
	276
	277	/*
d7482997	278	* strftime has the entertaining property that it returns 0
	279	* _either_ on out-of-space _or_ on successful generation of
	280	* the empty string. Hence we must ensure our format can never
	281	* generate the empty string. Somebody throw a custard pie at
	282	* whoever was responsible for that. Please?
	283	*/
	284	if (wfmt) {
	285	len = ustrlen(wfmt);
	286	fmt = mknewa(char, 2+len);
e4ea58f8	287	ustrtoa(wfmt, fmt+1, len+1, CS_ASCII); /* CS_FIXME? */
d7482997	288	fmt[0] = ' ';
	289	} else
	290	fmt = " %c";
	291
	292	while (1) {
	293	size += USTRFTIME_DELTA;
	294	blk = resize((char *)blk, size);
	295	len = strftime((char *)blk, size-1, fmt, timespec);
	296	if (len > 0)
	297	break;
	298	}
	299
	300	/* Note: +1 for the terminating 0, -1 for the initial space in fmt */
	301	wblk = resize((wchar_t *)blk, len);
	302	text = mknewa(char, len);
	303	strftime(text, len, fmt+1, timespec);
	304	/*
	305	* We operate in the C locale, so this all ought to be kosher
	306	* ASCII. If we ever move outside ASCII machines, we may need
	307	* to make this more portable...
	308	*/
	309	for (wp = wblk, p = text; *p; p++, wp++)
	310	wp = p;
	311	*wp = 0;
	312	if (wfmt)
	313	sfree(fmt);
	314	sfree(text);
	315	return wblk;
	316	}