2 * ustring.c: Unicode string routines
11 wchar_t *ustrdup(wchar_t const *s
) {
14 r
= snewn(1+ustrlen(s
), wchar_t);
23 static char *ustrtoa_internal(wchar_t const *s
, char *outbuf
, int size
,
24 int charset
, int careful
) {
26 charset_state state
= CHARSET_INIT_STATE
;
34 size
--; /* leave room for terminating NUL */
38 ret
= charset_from_unicode(&s
, &len
, outbuf
, size
, charset
, &state
,
39 (careful ?
&err
: NULL
));
51 ret
= charset_from_unicode(NULL
, 0, outbuf
, size
, charset
, &state
, NULL
);
58 char *ustrtoa(wchar_t const *s
, char *outbuf
, int size
, int charset
) {
59 return ustrtoa_internal(s
, outbuf
, size
, charset
, FALSE
);
62 char *ustrtoa_careful(wchar_t const *s
, char *outbuf
, int size
, int charset
) {
63 return ustrtoa_internal(s
, outbuf
, size
, charset
, TRUE
);
66 wchar_t *ustrfroma(char const *s
, wchar_t *outbuf
, int size
, int charset
) {
68 charset_state state
= CHARSET_INIT_STATE
;
76 size
--; /* allow for terminating NUL */
79 ret
= charset_to_unicode(&s
, &len
, outbuf
, size
,
80 charset
, &state
, NULL
, 0);
90 char *utoa_internal_dup(wchar_t const *s
, int charset
, int *lenp
, int careful
)
93 int outpos
, outlen
, len
, ret
, err
;
94 charset_state state
= CHARSET_INIT_STATE
;
103 outbuf
= snewn(outlen
, char);
106 outbuf
[outpos
] = '\0';
110 ret
= charset_from_unicode(&s
, &len
,
111 outbuf
+ outpos
, outlen
- outpos
- 1,
112 charset
, &state
, (careful ?
&err
: NULL
));
118 outlen
= outlen
* 3 / 2;
119 outbuf
= sresize(outbuf
, outlen
, char);
122 outbuf
[outpos
] = '\0';
127 outlen
= outpos
+ 32;
128 outbuf
= sresize(outbuf
, outlen
, char);
129 ret
= charset_from_unicode(NULL
, 0,
130 outbuf
+ outpos
, outlen
- outpos
+ 1,
131 charset
, &state
, NULL
);
133 outbuf
[outpos
] = '\0';
139 char *utoa_dup(wchar_t const *s
, int charset
)
141 return utoa_internal_dup(s
, charset
, NULL
, FALSE
);
144 char *utoa_dup_len(wchar_t const *s
, int charset
, int *len
)
146 return utoa_internal_dup(s
, charset
, len
, FALSE
);
149 char *utoa_careful_dup(wchar_t const *s
, int charset
)
151 return utoa_internal_dup(s
, charset
, NULL
, TRUE
);
154 wchar_t *ufroma_dup(char const *s
, int charset
) {
160 buf
= sresize(buf
, len
, wchar_t);
161 ustrfroma(s
, buf
, len
, charset
);
162 len
= (3 * len
) / 2 + 1; /* this guarantees a strict increase */
163 } while (ustrlen(buf
) >= len
-1);
165 buf
= sresize(buf
, ustrlen(buf
)+1, wchar_t);
169 char *utoa_locale_dup(wchar_t const *s
)
172 * This variant uses the C library locale.
180 ret
= snewn(1 + MB_CUR_MAX
* len
, char);
182 siz
= wcstombs(ret
, s
, len
);
185 assert(siz
<= MB_CUR_MAX
* len
);
187 ret
= sresize(ret
, siz
+1, char);
192 * If that failed, try a different strategy (which we will also
193 * attempt in the total absence of wcstombs). Retrieve the
194 * locale's charset from nl_langinfo or equivalent, and use
197 return utoa_dup(s
, charset_from_locale());
200 wchar_t *ufroma_locale_dup(char const *s
)
203 * This variant uses the C library locale.
211 ret
= snewn(1 + 2*len
, wchar_t); /* be conservative */
213 siz
= mbstowcs(ret
, s
, len
);
216 assert(siz
<= (size_t)(2 * len
));
218 ret
= sresize(ret
, siz
+1, wchar_t);
223 * If that failed, try a different strategy (which we will also
224 * attempt in the total absence of mbstowcs). Retrieve the
225 * locale's charset from nl_langinfo or equivalent, and use
228 return ufroma_dup(s
, charset_from_locale());
231 int ustrlen(wchar_t const *s
) {
237 wchar_t *uadv(wchar_t *s
) {
238 return s
+ 1 + ustrlen(s
);
241 wchar_t *ustrcpy(wchar_t *dest
, wchar_t const *source
) {
249 wchar_t *ustrncpy(wchar_t *dest
, wchar_t const *source
, int n
) {
253 if (*source
) source
++;
258 int ustrcmp(wchar_t *lhs
, wchar_t *rhs
) {
259 if (!lhs
&& !rhs
) return 0;
262 while (*lhs
&& *rhs
&& *lhs
==*rhs
)
266 else if (*lhs
> *rhs
)
271 wchar_t utolower(wchar_t c
) {
273 return c
; /* this property needed by ustricmp */
277 if (c
>= 'A' && c
<= 'Z')
283 int uisalpha(wchar_t c
) {
287 return (c
>= 'A' && c
<= 'Z') || (c
>= 'a' && c
<= 'z');
291 int ustricmp(wchar_t const *lhs
, wchar_t const *rhs
) {
293 while ((lc
= utolower(*lhs
)) == (rc
= utolower(*rhs
)) && lc
&& rc
)
303 int ustrnicmp(wchar_t const *lhs
, wchar_t const *rhs
, int maxlen
) {
304 wchar_t lc
= 0, rc
= 0;
305 while (maxlen
-- > 0 &&
306 (lc
= utolower(*lhs
)) == (rc
= utolower(*rhs
)) && lc
&& rc
)
316 wchar_t *ustrlow(wchar_t *s
) {
325 int utoi(wchar_t const *s
) {
335 while (*s
&& *s
>= L
'0' && *s
<= L
'9') {
344 double utof(wchar_t const *s
)
346 char *cs
= utoa_dup(s
, CS_ASCII
);
347 double ret
= atof(cs
);
352 int utob(wchar_t const *s
) {
353 if (!ustricmp(s
, L
"yes") || !ustricmp(s
, L
"y") ||
354 !ustricmp(s
, L
"true") || !ustricmp(s
, L
"t"))
359 int uisdigit(wchar_t c
) {
360 return c
>= L
'0' && c
<= L
'9';
363 #define USTRFTIME_DELTA 128
364 static void ustrftime_internal(rdstring
*rs
, char formatchr
,
365 const struct tm
*timespec
)
368 * strftime has the entertaining property that it returns 0
369 * _either_ on out-of-space _or_ on successful generation of
370 * the empty string. Hence we must ensure our format can never
371 * generate the empty string. Somebody throw a custard pie at
372 * whoever was responsible for that. Please?
382 /* Format chars are all ASCII, so conversion to Unicode is no problem */
388 size
+= USTRFTIME_DELTA
;
389 buf
= sresize(buf
, size
, wchar_t);
390 ret
= (int) wcsftime(buf
, size
, fmt
, timespec
);
408 size
+= USTRFTIME_DELTA
;
409 buf
= sresize(buf
, size
, char);
410 ret
= (int) strftime(buf
, size
, fmt
, timespec
);
413 cvtbuf
= ufroma_locale_dup(buf
+1);
420 wchar_t *ustrftime(const wchar_t *wfmt
, const struct tm
*timespec
)
422 rdstring rs
= { 0, 0, NULL
};
428 if (wfmt
[0] == L
'%' && wfmt
[1] == L
'%') {
431 } else if (wfmt
[0] == L
'%' && wfmt
[1]) {
432 ustrftime_internal(&rs
, wfmt
[1], timespec
);
444 * Determine whether a Unicode string can be translated into a
445 * given charset without any missing characters.
447 int cvt_ok(int charset
, const wchar_t *s
)
450 charset_state state
= CHARSET_INIT_STATE
;
451 int err
, len
= ustrlen(s
);
455 (void)charset_from_unicode(&s
, &len
, buf
, lenof(buf
),
456 charset
, &state
, &err
);