2 * ustring.c: Unicode string routines
11 wchar_t *ustrdup(wchar_t const *s
) {
14 r
= snewn(1+ustrlen(s
), wchar_t);
23 static char *ustrtoa_internal(wchar_t const *s
, char *outbuf
, int size
,
24 int charset
, int careful
) {
26 charset_state state
= CHARSET_INIT_STATE
;
34 size
--; /* leave room for terminating NUL */
38 ret
= charset_from_unicode(&s
, &len
, outbuf
, size
, charset
, &state
,
39 (careful ?
&err
: NULL
));
51 ret
= charset_from_unicode(NULL
, 0, outbuf
, size
, charset
, &state
, NULL
);
58 char *ustrtoa(wchar_t const *s
, char *outbuf
, int size
, int charset
) {
59 return ustrtoa_internal(s
, outbuf
, size
, charset
, FALSE
);
62 char *ustrtoa_careful(wchar_t const *s
, char *outbuf
, int size
, int charset
) {
63 return ustrtoa_internal(s
, outbuf
, size
, charset
, TRUE
);
66 wchar_t *ustrfroma(char const *s
, wchar_t *outbuf
, int size
, int charset
) {
68 charset_state state
= CHARSET_INIT_STATE
;
76 size
--; /* allow for terminating NUL */
79 ret
= charset_to_unicode(&s
, &len
, outbuf
, size
,
80 charset
, &state
, NULL
, 0);
90 char *utoa_internal_dup(wchar_t const *s
, int charset
, int *lenp
, int careful
)
93 int outpos
, outlen
, len
, ret
, err
;
94 charset_state state
= CHARSET_INIT_STATE
;
103 outbuf
= snewn(outlen
, char);
106 outbuf
[outpos
] = '\0';
110 ret
= charset_from_unicode(&s
, &len
,
111 outbuf
+ outpos
, outlen
- outpos
- 1,
112 charset
, &state
, (careful ?
&err
: NULL
));
118 outlen
= outlen
* 3 / 2;
119 outbuf
= sresize(outbuf
, outlen
, char);
122 outbuf
[outpos
] = '\0';
127 outlen
= outpos
+ 32;
128 outbuf
= sresize(outbuf
, outlen
, char);
129 ret
= charset_from_unicode(NULL
, 0,
130 outbuf
+ outpos
, outlen
- outpos
+ 1,
131 charset
, &state
, NULL
);
133 outbuf
[outpos
] = '\0';
139 char *utoa_dup(wchar_t const *s
, int charset
)
141 return utoa_internal_dup(s
, charset
, NULL
, FALSE
);
144 char *utoa_dup_len(wchar_t const *s
, int charset
, int *len
)
146 return utoa_internal_dup(s
, charset
, len
, FALSE
);
149 char *utoa_careful_dup(wchar_t const *s
, int charset
)
151 return utoa_internal_dup(s
, charset
, NULL
, TRUE
);
154 wchar_t *ufroma_dup(char const *s
, int charset
) {
160 buf
= sresize(buf
, len
, wchar_t);
161 ustrfroma(s
, buf
, len
, charset
);
162 len
= (3 * len
) / 2 + 1; /* this guarantees a strict increase */
163 } while (ustrlen(buf
) >= len
-1);
165 buf
= sresize(buf
, ustrlen(buf
)+1, wchar_t);
169 char *utoa_locale_dup(wchar_t const *s
)
172 * This variant uses the C library locale.
180 outlen
= 1 + MB_CUR_MAX
* len
;
181 ret
= snewn(outlen
+1, char);
183 siz
= wcstombs(ret
, s
, outlen
);
186 assert(siz
<= (size_t)(outlen
));
188 ret
= sresize(ret
, siz
+1, char);
193 * If that failed, try a different strategy (which we will also
194 * attempt in the total absence of wcstombs). Retrieve the
195 * locale's charset from nl_langinfo or equivalent, and use
198 return utoa_dup(s
, charset_from_locale());
201 wchar_t *ufroma_locale_dup(char const *s
)
204 * This variant uses the C library locale.
213 ret
= snewn(outlen
+1, wchar_t); /* be conservative */
215 siz
= mbstowcs(ret
, s
, outlen
);
218 assert(siz
<= (size_t)(outlen
));
220 ret
= sresize(ret
, siz
+1, wchar_t);
225 * If that failed, try a different strategy (which we will also
226 * attempt in the total absence of mbstowcs). Retrieve the
227 * locale's charset from nl_langinfo or equivalent, and use
230 return ufroma_dup(s
, charset_from_locale());
233 int ustrlen(wchar_t const *s
) {
239 wchar_t *uadv(wchar_t *s
) {
240 return s
+ 1 + ustrlen(s
);
243 wchar_t *ustrcpy(wchar_t *dest
, wchar_t const *source
) {
251 wchar_t *ustrncpy(wchar_t *dest
, wchar_t const *source
, int n
) {
255 if (*source
) source
++;
260 int ustrcmp(wchar_t *lhs
, wchar_t *rhs
) {
261 if (!lhs
&& !rhs
) return 0;
264 while (*lhs
&& *rhs
&& *lhs
==*rhs
)
268 else if (*lhs
> *rhs
)
273 wchar_t utolower(wchar_t c
) {
275 return c
; /* this property needed by ustricmp */
279 if (c
>= 'A' && c
<= 'Z')
285 int uisalpha(wchar_t c
) {
289 return (c
>= 'A' && c
<= 'Z') || (c
>= 'a' && c
<= 'z');
293 int ustricmp(wchar_t const *lhs
, wchar_t const *rhs
) {
295 while ((lc
= utolower(*lhs
)) == (rc
= utolower(*rhs
)) && lc
&& rc
)
305 int ustrnicmp(wchar_t const *lhs
, wchar_t const *rhs
, int maxlen
) {
306 wchar_t lc
= 0, rc
= 0;
307 while (maxlen
-- > 0 &&
308 (lc
= utolower(*lhs
)) == (rc
= utolower(*rhs
)) && lc
&& rc
)
318 wchar_t *ustrlow(wchar_t *s
) {
327 int utoi(wchar_t const *s
) {
337 while (*s
&& *s
>= L
'0' && *s
<= L
'9') {
346 double utof(wchar_t const *s
)
348 char *cs
= utoa_dup(s
, CS_ASCII
);
349 double ret
= atof(cs
);
354 int utob(wchar_t const *s
) {
355 if (!ustricmp(s
, L
"yes") || !ustricmp(s
, L
"y") ||
356 !ustricmp(s
, L
"true") || !ustricmp(s
, L
"t"))
361 int uisdigit(wchar_t c
) {
362 return c
>= L
'0' && c
<= L
'9';
365 #define USTRFTIME_DELTA 128
366 static void ustrftime_internal(rdstring
*rs
, char formatchr
,
367 const struct tm
*timespec
)
370 * strftime has the entertaining property that it returns 0
371 * _either_ on out-of-space _or_ on successful generation of
372 * the empty string. Hence we must ensure our format can never
373 * generate the empty string. Somebody throw a custard pie at
374 * whoever was responsible for that. Please?
384 /* Format chars are all ASCII, so conversion to Unicode is no problem */
390 size
+= USTRFTIME_DELTA
;
391 buf
= sresize(buf
, size
, wchar_t);
392 ret
= (int) wcsftime(buf
, size
, fmt
, timespec
);
410 size
+= USTRFTIME_DELTA
;
411 buf
= sresize(buf
, size
, char);
412 ret
= (int) strftime(buf
, size
, fmt
, timespec
);
415 cvtbuf
= ufroma_locale_dup(buf
+1);
422 wchar_t *ustrftime(const wchar_t *wfmt
, const struct tm
*timespec
)
424 rdstring rs
= { 0, 0, NULL
};
430 if (wfmt
[0] == L
'%' && wfmt
[1] == L
'%') {
433 } else if (wfmt
[0] == L
'%' && wfmt
[1]) {
434 ustrftime_internal(&rs
, wfmt
[1], timespec
);
446 * Determine whether a Unicode string can be translated into a
447 * given charset without any missing characters.
449 int cvt_ok(int charset
, const wchar_t *s
)
452 charset_state state
= CHARSET_INIT_STATE
;
453 int err
, len
= ustrlen(s
);
457 (void)charset_from_unicode(&s
, &len
, buf
, lenof(buf
),
458 charset
, &state
, &err
);
466 * Wrapper around charset_from_localenc which accepts the charset
467 * name as a wide string (since that happens to be more useful).
468 * Also throws a Halibut error and falls back to CS_ASCII if the
469 * charset is unrecognised, meaning the rest of the program can
470 * rely on always getting a valid charset id back from this
473 int charset_from_ustr(filepos
*fpos
, const wchar_t *name
)
478 csname
= utoa_dup(name
, CS_ASCII
);
479 charset
= charset_from_localenc(csname
);
481 if (charset
== CS_NONE
) {
483 error(err_charset
, fpos
, name
);