2 * ustring.c: Unicode string routines
11 wchar_t *ustrdup(wchar_t const *s
) {
14 r
= mknewa(wchar_t, 1+ustrlen(s
));
23 static char *ustrtoa_internal(wchar_t const *s
, char *outbuf
, int size
,
24 int charset
, int careful
) {
26 charset_state state
= CHARSET_INIT_STATE
;
34 size
--; /* leave room for terminating NUL */
38 ret
= charset_from_unicode(&s
, &len
, outbuf
, size
, charset
, &state
,
39 (careful ?
&err
: NULL
));
51 ret
= charset_from_unicode(NULL
, 0, outbuf
, size
, charset
, &state
, NULL
);
58 char *ustrtoa(wchar_t const *s
, char *outbuf
, int size
, int charset
) {
59 return ustrtoa_internal(s
, outbuf
, size
, charset
, FALSE
);
62 char *ustrtoa_careful(wchar_t const *s
, char *outbuf
, int size
, int charset
) {
63 return ustrtoa_internal(s
, outbuf
, size
, charset
, TRUE
);
66 wchar_t *ustrfroma(char const *s
, wchar_t *outbuf
, int size
, int charset
) {
68 charset_state state
= CHARSET_INIT_STATE
;
76 size
--; /* allow for terminating NUL */
79 ret
= charset_to_unicode(&s
, &len
, outbuf
, size
,
80 charset
, &state
, NULL
, 0);
90 char *utoa_internal_dup(wchar_t const *s
, int charset
, int *lenp
, int careful
)
93 int outpos
, outlen
, len
, ret
, err
;
94 charset_state state
= CHARSET_INIT_STATE
;
103 outbuf
= mknewa(char, outlen
);
106 outbuf
[outpos
] = '\0';
110 ret
= charset_from_unicode(&s
, &len
,
111 outbuf
+ outpos
, outlen
- outpos
- 1,
112 charset
, &state
, (careful ?
&err
: NULL
));
118 outlen
= outlen
* 3 / 2;
119 outbuf
= resize(outbuf
, outlen
);
122 outbuf
[outpos
] = '\0';
127 outlen
= outpos
+ 32;
128 outbuf
= resize(outbuf
, outlen
);
129 ret
= charset_from_unicode(NULL
, 0,
130 outbuf
+ outpos
, outlen
- outpos
+ 1,
131 charset
, &state
, NULL
);
133 outbuf
[outpos
] = '\0';
139 char *utoa_dup(wchar_t const *s
, int charset
)
141 return utoa_internal_dup(s
, charset
, NULL
, FALSE
);
144 char *utoa_dup_len(wchar_t const *s
, int charset
, int *len
)
146 return utoa_internal_dup(s
, charset
, len
, FALSE
);
149 char *utoa_careful_dup(wchar_t const *s
, int charset
)
151 return utoa_internal_dup(s
, charset
, NULL
, TRUE
);
154 wchar_t *ufroma_dup(char const *s
, int charset
) {
160 buf
= resize(buf
, len
);
161 ustrfroma(s
, buf
, len
, charset
);
162 len
= (3 * len
) / 2 + 1; /* this guarantees a strict increase */
163 } while (ustrlen(buf
) >= len
-1);
165 buf
= resize(buf
, ustrlen(buf
)+1);
169 char *utoa_locale_dup(wchar_t const *s
)
172 * This variant uses the C library locale.
180 ret
= mknewa(char, 1 + MB_CUR_MAX
* len
);
182 siz
= wcstombs(ret
, s
, len
);
185 assert(siz
<= MB_CUR_MAX
* len
);
187 ret
= resize(ret
, siz
+1);
192 * If that failed, try a different strategy (which we will also
193 * attempt in the total absence of wcstombs). Retrieve the
194 * locale's charset from nl_langinfo or equivalent, and use
197 return utoa_dup(s
, charset_from_locale());
200 wchar_t *ufroma_locale_dup(char const *s
)
203 * This variant uses the C library locale.
211 ret
= mknewa(wchar_t, 1 + 2*len
); /* be conservative */
213 siz
= mbstowcs(ret
, s
, len
);
216 assert(siz
<= (size_t)(2 * len
));
218 ret
= resize(ret
, siz
+1);
223 * If that failed, try a different strategy (which we will also
224 * attempt in the total absence of mbstowcs). Retrieve the
225 * locale's charset from nl_langinfo or equivalent, and use
228 return ufroma_dup(s
, charset_from_locale());
231 int ustrlen(wchar_t const *s
) {
/*
 * Advance past the string starting at `s': returns the address
 * ustrlen(s) + 1 elements beyond `s'.
 *
 * Assuming ustrlen counts the characters before the terminator,
 * that is the element immediately following the terminator, i.e.
 * the start of the next string when several are stored back to
 * back -- TODO confirm against ustrlen.
 */
wchar_t *uadv(wchar_t *s) {
    return s + 1 + ustrlen(s);
}
241 wchar_t *ustrcpy(wchar_t *dest
, wchar_t const *source
) {
249 int ustrcmp(wchar_t *lhs
, wchar_t *rhs
) {
250 if (!lhs
&& !rhs
) return 0;
253 while (*lhs
&& *rhs
&& *lhs
==*rhs
)
257 else if (*lhs
> *rhs
)
262 wchar_t utolower(wchar_t c
) {
264 return c
; /* this property needed by ustricmp */
268 if (c
>= 'A' && c
<= 'Z')
274 int uisalpha(wchar_t c
) {
278 return (c
>= 'A' && c
<= 'Z') || (c
>= 'a' && c
<= 'z');
282 int ustricmp(wchar_t *lhs
, wchar_t *rhs
) {
284 while ((lc
= utolower(*lhs
)) == (rc
= utolower(*rhs
)) && lc
&& rc
)
294 wchar_t *ustrlow(wchar_t *s
) {
303 int utoi(wchar_t *s
) {
313 while (*s
&& *s
>= L
'0' && *s
<= L
'9') {
322 int utob(wchar_t *s
) {
323 if (!ustricmp(s
, L
"yes") || !ustricmp(s
, L
"y") ||
324 !ustricmp(s
, L
"true") || !ustricmp(s
, L
"t"))
/*
 * Test whether `c' is a decimal digit.
 *
 * Returns 1 when `c' lies in the range L'0' .. L'9' inclusive and
 * 0 otherwise. The comparison is made directly on the character
 * value, so it does not depend on the C library locale.
 */
int uisdigit(wchar_t c) {
    return c >= L'0' && c <= L'9';
}
333 #define USTRFTIME_DELTA 128
334 wchar_t *ustrftime(wchar_t *wfmt
, struct tm
*timespec
) {
337 char *fmt
, *text
, *p
;
342 * FIXME: really we ought to copy non-% parts of the format
343 * ourselves, and only resort to strftime for % parts. Also we
344 * should use wcsftime if it's present.
348 * strftime has the entertaining property that it returns 0
349 * _either_ on out-of-space _or_ on successful generation of
350 * the empty string. Hence we must ensure our format can never
351 * generate the empty string. Somebody throw a custard pie at
352 * whoever was responsible for that. Please?
356 fmt
= mknewa(char, 2+len
);
357 ustrtoa(wfmt
, fmt
+1, len
+1, CS_ASCII
); /* CS_FIXME? */
363 size
+= USTRFTIME_DELTA
;
364 blk
= resize((char *)blk
, size
);
365 len
= strftime((char *)blk
, size
-1, fmt
, timespec
);
370 /* Note: +1 for the terminating 0, -1 for the initial space in fmt */
371 wblk
= resize((wchar_t *)blk
, len
);
372 text
= mknewa(char, len
);
373 strftime(text
, len
, fmt
+1, timespec
);
375 * We operate in the C locale, so this all ought to be kosher
376 * ASCII. If we ever move outside ASCII machines, we may need
377 * to make this more portable...
379 for (wp
= wblk
, p
= text
; *p
; p
++, wp
++)
389 * Determine whether a Unicode string can be translated into a
390 * given charset without any missing characters.
392 int cvt_ok(int charset
, const wchar_t *s
)
395 charset_state state
= CHARSET_INIT_STATE
;
396 int err
, len
= ustrlen(s
);
400 (void)charset_from_unicode(&s
, &len
, buf
, lenof(buf
),
401 charset
, &state
, &err
);