mdw@git.distorted.org.uk Git - sgt/halibut/blame_incremental

... / ...

Commit	Line	Data
	1	/*
	2	* ustring.c: Unicode string routines
	3	*/
	4
	5	#include <wchar.h>
	6	#include <stdlib.h>
	7	#include <assert.h>
	8	#include <time.h>
	9	#include "halibut.h"
	10
	11	wchar_t ustrdup(wchar_t const s) {
	12	wchar_t *r;
	13	if (s) {
	14	r = snewn(1+ustrlen(s), wchar_t);
	15	ustrcpy(r, s);
	16	} else {
	17	r = snew(wchar_t);
	18	*r = 0;
	19	}
	20	return r;
	21	}
	22
	23	static char ustrtoa_internal(wchar_t const s, char *outbuf, int size,
	24	int charset, int careful) {
	25	int len, ret, err;
	26	charset_state state = CHARSET_INIT_STATE;
	27
	28	if (!s) {
	29	*outbuf = '\0';
	30	return outbuf;
	31	}
	32
	33	len = ustrlen(s);
	34	size--; /* leave room for terminating NUL */
	35	*outbuf = '\0';
	36	while (len > 0) {
	37	err = 0;
	38	ret = charset_from_unicode(&s, &len, outbuf, size, charset, &state,
	39	(careful ? &err : NULL));
	40	if (err)
	41	return NULL;
	42	if (!ret)
	43	return outbuf;
	44	size -= ret;
	45	outbuf += ret;
	46	*outbuf = '\0';
	47	}
	48	/*
	49	* Clean up
	50	*/
	51	ret = charset_from_unicode(NULL, 0, outbuf, size, charset, &state, NULL);
	52	size -= ret;
	53	outbuf += ret;
	54	*outbuf = '\0';
	55	return outbuf;
	56	}
	57
	58	char ustrtoa(wchar_t const s, char *outbuf, int size, int charset) {
	59	return ustrtoa_internal(s, outbuf, size, charset, FALSE);
	60	}
	61
	62	char ustrtoa_careful(wchar_t const s, char *outbuf, int size, int charset) {
	63	return ustrtoa_internal(s, outbuf, size, charset, TRUE);
	64	}
	65
	66	wchar_t ustrfroma(char const s, wchar_t *outbuf, int size, int charset) {
	67	int len, ret;
	68	charset_state state = CHARSET_INIT_STATE;
	69
	70	if (!s) {
	71	*outbuf = L'\0';
	72	return outbuf;
	73	}
	74
	75	len = strlen(s);
	76	size--; /* allow for terminating NUL */
	77	*outbuf = L'\0';
	78	while (len > 0) {
	79	ret = charset_to_unicode(&s, &len, outbuf, size,
	80	charset, &state, NULL, 0);
	81	if (!ret)
	82	return outbuf;
	83	outbuf += ret;
	84	size -= ret;
	85	*outbuf = L'\0';
	86	}
	87	return outbuf;
	88	}
	89
	90	char utoa_internal_dup(wchar_t const s, int charset, int *lenp, int careful)
	91	{
	92	char *outbuf;
	93	int outpos, outlen, len, ret, err;
	94	charset_state state = CHARSET_INIT_STATE;
	95
	96	if (!s) {
	97	return dupstr("");
	98	}
	99
	100	len = ustrlen(s);
	101
	102	outlen = len + 10;
	103	outbuf = snewn(outlen, char);
	104
	105	outpos = 0;
	106	outbuf[outpos] = '\0';
	107
	108	while (len > 0) {
	109	err = 0;
	110	ret = charset_from_unicode(&s, &len,
	111	outbuf + outpos, outlen - outpos - 1,
	112	charset, &state, (careful ? &err : NULL));
	113	if (err) {
	114	sfree(outbuf);
	115	return NULL;
	116	}
	117	if (!ret) {
	118	outlen = outlen * 3 / 2;
	119	outbuf = sresize(outbuf, outlen, char);
	120	}
	121	outpos += ret;
	122	outbuf[outpos] = '\0';
	123	}
	124	/*
	125	* Clean up
	126	*/
	127	outlen = outpos + 32;
	128	outbuf = sresize(outbuf, outlen, char);
	129	ret = charset_from_unicode(NULL, 0,
	130	outbuf + outpos, outlen - outpos + 1,
	131	charset, &state, NULL);
	132	outpos += ret;
	133	outbuf[outpos] = '\0';
	134	if (lenp)
	135	*lenp = outpos;
	136	return outbuf;
	137	}
	138
	139	char utoa_dup(wchar_t const s, int charset)
	140	{
	141	return utoa_internal_dup(s, charset, NULL, FALSE);
	142	}
	143
	144	char utoa_dup_len(wchar_t const s, int charset, int *len)
	145	{
	146	return utoa_internal_dup(s, charset, len, FALSE);
	147	}
	148
	149	char utoa_careful_dup(wchar_t const s, int charset)
	150	{
	151	return utoa_internal_dup(s, charset, NULL, TRUE);
	152	}
	153
	154	wchar_t ufroma_dup(char const s, int charset) {
	155	int len;
	156	wchar_t *buf = NULL;
	157
	158	len = strlen(s) + 1;
	159	do {
	160	buf = sresize(buf, len, wchar_t);
	161	ustrfroma(s, buf, len, charset);
	162	len = (3 * len) / 2 + 1; /* this guarantees a strict increase */
	163	} while (ustrlen(buf) >= len-1);
	164
	165	buf = sresize(buf, ustrlen(buf)+1, wchar_t);
	166	return buf;
	167	}
	168
	169	char utoa_locale_dup(wchar_t const s)
	170	{
	171	/*
	172	* This variant uses the C library locale.
	173	*/
	174	char *ret;
	175	int len;
	176	size_t siz;
	177
	178	len = ustrlen(s);
	179
	180	ret = snewn(1 + MB_CUR_MAX * len, char);
	181
	182	siz = wcstombs(ret, s, len);
	183
	184	if (siz) {
	185	assert(siz <= MB_CUR_MAX * len);
	186	ret[siz] = '\0';
	187	ret = sresize(ret, siz+1, char);
	188	return ret;
	189	}
	190
	191	/*
	192	* If that failed, try a different strategy (which we will also
	193	* attempt in the total absence of wcstombs). Retrieve the
	194	* locale's charset from nl_langinfo or equivalent, and use
	195	* normal utoa_dup.
	196	*/
	197	return utoa_dup(s, charset_from_locale());
	198	}
	199
	200	wchar_t ufroma_locale_dup(char const s)
	201	{
	202	/*
	203	* This variant uses the C library locale.
	204	*/
	205	wchar_t *ret;
	206	int len;
	207	size_t siz;
	208
	209	len = strlen(s);
	210
	211	ret = snewn(1 + 2len, wchar_t); / be conservative */
	212
	213	siz = mbstowcs(ret, s, len);
	214
	215	if (siz) {
	216	assert(siz <= (size_t)(2 * len));
	217	ret[siz] = L'\0';
	218	ret = sresize(ret, siz+1, wchar_t);
	219	return ret;
	220	}
	221
	222	/*
	223	* If that failed, try a different strategy (which we will also
	224	* attempt in the total absence of wcstombs). Retrieve the
	225	* locale's charset from nl_langinfo or equivalent, and use
	226	* normal ufroma_dup.
	227	*/
	228	return ufroma_dup(s, charset_from_locale());
	229	}
	230
	231	int ustrlen(wchar_t const *s) {
	232	int len = 0;
	233	while (*s++) len++;
	234	return len;
	235	}
	236
	237	wchar_t uadv(wchar_t s) {
	238	return s + 1 + ustrlen(s);
	239	}
	240
	241	wchar_t ustrcpy(wchar_t dest, wchar_t const *source) {
	242	wchar_t *ret = dest;
	243	do {
	244	dest++ = source;
	245	} while (*source++);
	246	return ret;
	247	}
	248
	249	wchar_t ustrncpy(wchar_t dest, wchar_t const *source, int n) {
	250	wchar_t *ret = dest;
	251	do {
	252	dest++ = source;
	253	if (*source) source++;
	254	} while (n-- > 0);
	255	return ret;
	256	}
	257
	258	int ustrcmp(wchar_t lhs, wchar_t rhs) {
	259	if (!lhs && !rhs) return 0;
	260	if (!lhs) return -1;
	261	if (!rhs) return +1;
	262	while (lhs && rhs && lhs==rhs)
	263	lhs++, rhs++;
	264	if (lhs < rhs)
	265	return -1;
	266	else if (lhs > rhs)
	267	return 1;
	268	return 0;
	269	}
	270
	271	wchar_t utolower(wchar_t c) {
	272	if (c == L'\0')
	273	return c; /* this property needed by ustricmp */
	274	#ifdef HAS_TOWLOWER
	275	return towlower(c);
	276	#else
	277	if (c >= 'A' && c <= 'Z')
	278	c += 'a'-'A';
	279	return c;
	280	#endif
	281	}
	282
	283	int uisalpha(wchar_t c) {
	284	#ifdef HAS_ISWALPHA
	285	return iswalpha(c);
	286	#else
	287	return (c >= 'A' && c <= 'Z') \|\| (c >= 'a' && c <= 'z');
	288	#endif
	289	}
	290
	291	int ustricmp(wchar_t const lhs, wchar_t const rhs) {
	292	wchar_t lc, rc;
	293	while ((lc = utolower(lhs)) == (rc = utolower(rhs)) && lc && rc)
	294	lhs++, rhs++;
	295	if (!lc && !rc)
	296	return 0;
	297	if (lc < rc)
	298	return -1;
	299	else
	300	return 1;
	301	}
	302
	303	int ustrnicmp(wchar_t const lhs, wchar_t const rhs, int maxlen) {
	304	wchar_t lc = 0, rc = 0;
	305	while (maxlen-- > 0 &&
	306	(lc = utolower(lhs)) == (rc = utolower(rhs)) && lc && rc)
	307	lhs++, rhs++;
	308	if (lc < rc)
	309	return -1;
	310	else if (lc > rc)
	311	return 1;
	312	else
	313	return 0;
	314	}
	315
	316	wchar_t ustrlow(wchar_t s) {
	317	wchar_t *p = s;
	318	while (*p) {
	319	p = utolower(p);
	320	p++;
	321	}
	322	return s;
	323	}
	324
	325	int utoi(wchar_t const *s) {
	326	int sign = +1;
	327	int n;
	328
	329	if (*s == L'-') {
	330	s++;
	331	sign = -1;
	332	}
	333
	334	n = 0;
	335	while (s && s >= L'0' && *s <= L'9') {
	336	n *= 10;
	337	n += (*s - '0');
	338	s++;
	339	}
	340
	341	return n;
	342	}
	343
	344	double utof(wchar_t const *s)
	345	{
	346	char *cs = utoa_dup(s, CS_ASCII);
	347	double ret = atof(cs);
	348	sfree(cs);
	349	return ret;
	350	}
	351
	352	int utob(wchar_t const *s) {
	353	if (!ustricmp(s, L"yes") \|\| !ustricmp(s, L"y") \|\|
	354	!ustricmp(s, L"true") \|\| !ustricmp(s, L"t"))
	355	return TRUE;
	356	return FALSE;
	357	}
	358
	359	int uisdigit(wchar_t c) {
	360	return c >= L'0' && c <= L'9';
	361	}
	362
	363	#define USTRFTIME_DELTA 128
	364	static void ustrftime_internal(rdstring *rs, char formatchr,
	365	const struct tm *timespec)
	366	{
	367	/*
	368	* strftime has the entertaining property that it returns 0
	369	* _either_ on out-of-space _or_ on successful generation of
	370	* the empty string. Hence we must ensure our format can never
	371	* generate the empty string. Somebody throw a custard pie at
	372	* whoever was responsible for that. Please?
	373	*/
	374
	375	#ifdef HAS_WCSFTIME
	376	wchar_t *buf = NULL;
	377	wchar_t fmt[4];
	378	int size, ret;
	379
	380	fmt[0] = L' ';
	381	fmt[1] = L'%';
	382	/* Format chars are all ASCII, so conversion to Unicode is no problem */
	383	fmt[2] = formatchr;
	384	fmt[3] = L'\0';
	385
	386	size = 0;
	387	do {
	388	size += USTRFTIME_DELTA;
	389	buf = sresize(buf, size, wchar_t);
	390	ret = (int) wcsftime(buf, size, fmt, timespec);
	391	} while (ret == 0);
	392
	393	rdadds(rs, buf+1);
	394	sfree(buf);
	395	#else
	396	char *buf = NULL;
	397	wchar_t *cvtbuf;
	398	char fmt[4];
	399	int size, ret;
	400
	401	fmt[0] = ' ';
	402	fmt[1] = '%';
	403	fmt[2] = formatchr;
	404	fmt[3] = '\0';
	405
	406	size = 0;
	407	do {
	408	size += USTRFTIME_DELTA;
	409	buf = sresize(buf, size, char);
	410	ret = (int) strftime(buf, size, fmt, timespec);
	411	} while (ret == 0);
	412
	413	cvtbuf = ufroma_locale_dup(buf+1);
	414	rdadds(rs, cvtbuf);
	415	sfree(cvtbuf);
	416	sfree(buf);
	417	#endif
	418	}
	419
	420	wchar_t ustrftime(const wchar_t wfmt, const struct tm *timespec)
	421	{
	422	rdstring rs = { 0, 0, NULL };
	423
	424	if (!wfmt)
	425	wfmt = L"%c";
	426
	427	while (*wfmt) {
	428	if (wfmt[0] == L'%' && wfmt[1] == L'%') {
	429	rdadd(&rs, L'%');
	430	wfmt += 2;
	431	} else if (wfmt[0] == L'%' && wfmt[1]) {
	432	ustrftime_internal(&rs, wfmt[1], timespec);
	433	wfmt += 2;
	434	} else {
	435	rdadd(&rs, wfmt[0]);
	436	wfmt++;
	437	}
	438	}
	439
	440	return rdtrim(&rs);
	441	}
	442
	443	/*
	444	* Determine whether a Unicode string can be translated into a
	445	* given charset without any missing characters.
	446	*/
	447	int cvt_ok(int charset, const wchar_t *s)
	448	{
	449	char buf[256];
	450	charset_state state = CHARSET_INIT_STATE;
	451	int err, len = ustrlen(s);
	452
	453	err = 0;
	454	while (len > 0) {
	455	(void)charset_from_unicode(&s, &len, buf, lenof(buf),
	456	charset, &state, &err);
	457	if (err)
	458	return FALSE;
	459	}
	460	return TRUE;
	461	}