d7482997 |
1 | /* |
2 | * ustring.c: Unicode string routines |
3 | */ |
4 | |
5 | #include <wchar.h> |
7e976207 |
6 | #include <stdlib.h> |
7 | #include <assert.h> |
d7482997 |
8 | #include <time.h> |
9 | #include "halibut.h" |
10 | |
e4ea58f8 |
11 | wchar_t *ustrdup(wchar_t const *s) { |
d7482997 |
12 | wchar_t *r; |
13 | if (s) { |
14 | r = mknewa(wchar_t, 1+ustrlen(s)); |
15 | ustrcpy(r, s); |
16 | } else { |
17 | r = mknew(wchar_t); |
18 | *r = 0; |
19 | } |
20 | return r; |
21 | } |
22 | |
e4ea58f8 |
23 | static char *ustrtoa_internal(wchar_t const *s, char *outbuf, int size, |
24 | int charset, int careful) { |
25 | int len, ret, err; |
26 | charset_state state = CHARSET_INIT_STATE; |
27 | |
d7482997 |
28 | if (!s) { |
29 | *outbuf = '\0'; |
30 | return outbuf; |
31 | } |
e4ea58f8 |
32 | |
33 | len = ustrlen(s); |
34 | size--; /* leave room for terminating NUL */ |
35 | *outbuf = '\0'; |
36 | while (len > 0) { |
37 | err = 0; |
38 | ret = charset_from_unicode(&s, &len, outbuf, size, charset, &state, |
39 | (careful ? &err : NULL)); |
40 | if (err) |
41 | return NULL; |
42 | if (!ret) |
43 | return outbuf; |
44 | size -= ret; |
45 | outbuf += ret; |
46 | *outbuf = '\0'; |
47 | } |
48 | /* |
49 | * Clean up |
50 | */ |
51 | ret = charset_from_unicode(NULL, 0, outbuf, size, charset, &state, NULL); |
52 | size -= ret; |
53 | outbuf += ret; |
54 | *outbuf = '\0'; |
d7482997 |
55 | return outbuf; |
56 | } |
57 | |
e4ea58f8 |
58 | char *ustrtoa(wchar_t const *s, char *outbuf, int size, int charset) { |
59 | return ustrtoa_internal(s, outbuf, size, charset, FALSE); |
60 | } |
61 | |
62 | char *ustrtoa_careful(wchar_t const *s, char *outbuf, int size, int charset) { |
63 | return ustrtoa_internal(s, outbuf, size, charset, TRUE); |
64 | } |
65 | |
66 | wchar_t *ustrfroma(char const *s, wchar_t *outbuf, int size, int charset) { |
67 | int len, ret; |
68 | charset_state state = CHARSET_INIT_STATE; |
69 | |
ba9c1487 |
70 | if (!s) { |
71 | *outbuf = L'\0'; |
72 | return outbuf; |
73 | } |
e4ea58f8 |
74 | |
75 | len = strlen(s); |
76 | size--; /* allow for terminating NUL */ |
77 | *outbuf = L'\0'; |
78 | while (len > 0) { |
79 | ret = charset_to_unicode(&s, &len, outbuf, size, |
80 | charset, &state, NULL, 0); |
81 | if (!ret) |
82 | return outbuf; |
83 | outbuf += ret; |
84 | size -= ret; |
85 | *outbuf = L'\0'; |
86 | } |
ba9c1487 |
87 | return outbuf; |
88 | } |
89 | |
e4ea58f8 |
90 | char *utoa_internal_dup(wchar_t const *s, int charset, int *lenp, int careful) |
91 | { |
92 | char *outbuf; |
93 | int outpos, outlen, len, ret, err; |
94 | charset_state state = CHARSET_INIT_STATE; |
50d6b4bd |
95 | |
e4ea58f8 |
96 | if (!s) { |
97 | return dupstr(""); |
98 | } |
50d6b4bd |
99 | |
e4ea58f8 |
100 | len = ustrlen(s); |
101 | |
102 | outlen = len + 10; |
103 | outbuf = mknewa(char, outlen); |
104 | |
105 | outpos = 0; |
106 | outbuf[outpos] = '\0'; |
107 | |
108 | while (len > 0) { |
109 | err = 0; |
110 | ret = charset_from_unicode(&s, &len, |
111 | outbuf + outpos, outlen - outpos - 1, |
112 | charset, &state, (careful ? &err : NULL)); |
113 | if (err) { |
114 | sfree(outbuf); |
115 | return NULL; |
116 | } |
117 | if (!ret) { |
118 | outlen = outlen * 3 / 2; |
119 | outbuf = resize(outbuf, outlen); |
120 | } |
121 | outpos += ret; |
122 | outbuf[outpos] = '\0'; |
123 | } |
124 | /* |
125 | * Clean up |
126 | */ |
127 | outlen = outpos + 32; |
128 | outbuf = resize(outbuf, outlen); |
129 | ret = charset_from_unicode(NULL, 0, |
130 | outbuf + outpos, outlen - outpos + 1, |
131 | charset, &state, NULL); |
132 | outpos += ret; |
133 | outbuf[outpos] = '\0'; |
134 | if (lenp) |
135 | *lenp = outpos; |
136 | return outbuf; |
50d6b4bd |
137 | } |
138 | |
e4ea58f8 |
139 | char *utoa_dup(wchar_t const *s, int charset) |
140 | { |
141 | return utoa_internal_dup(s, charset, NULL, FALSE); |
142 | } |
143 | |
144 | char *utoa_dup_len(wchar_t const *s, int charset, int *len) |
145 | { |
146 | return utoa_internal_dup(s, charset, len, FALSE); |
147 | } |
148 | |
149 | char *utoa_careful_dup(wchar_t const *s, int charset) |
150 | { |
151 | return utoa_internal_dup(s, charset, NULL, TRUE); |
152 | } |
153 | |
/*
 * Convert the `charset'-encoded string `s' to wide characters in
 * a dynamically allocated buffer, trimmed to fit the result.
 * A NULL `s' yields an empty wide string.
 */
wchar_t *ufroma_dup(char const *s, int charset) {
    int len;
    wchar_t *buf = NULL;

    /*
     * Start with room for one wide character per input byte, the
     * terminator, and one spare element. The spare element means
     * that a one-to-one conversion does not look like a full
     * buffer and so does not trigger a pointless retry.
     */
    len = (s ? strlen(s) : 0) + 2;

    for (;;) {
        buf = resize(buf, len);
        ustrfroma(s, buf, len, charset);

        /*
         * ustrfroma stores at most len-1 characters. If it used
         * every one of them we cannot tell a perfect fit from a
         * truncation, so enlarge the buffer and convert again.
         * The test must use the size we actually converted with,
         * hence it comes before len is increased.
         */
        if (ustrlen(buf) < len-1)
            break;

        len = (3 * len) / 2 + 1;       /* this guarantees a strict increase */
    }

    buf = resize(buf, ustrlen(buf)+1);
    return buf;
}
168 | |
7e976207 |
169 | char *utoa_locale_dup(wchar_t const *s) |
170 | { |
171 | /* |
172 | * This variant uses the C library locale. |
173 | */ |
174 | char *ret; |
175 | int len; |
176 | size_t siz; |
177 | |
178 | len = ustrlen(s); |
179 | |
180 | ret = mknewa(char, 1 + MB_CUR_MAX * len); |
181 | |
182 | siz = wcstombs(ret, s, len); |
183 | |
184 | if (siz) { |
185 | assert(siz <= MB_CUR_MAX * len); |
186 | ret[siz] = '\0'; |
187 | ret = resize(ret, siz+1); |
188 | return ret; |
189 | } |
190 | |
191 | /* |
192 | * If that failed, try a different strategy (which we will also |
193 | * attempt in the total absence of wcstombs). Retrieve the |
194 | * locale's charset from nl_langinfo or equivalent, and use |
195 | * normal utoa_dup. |
196 | */ |
197 | return utoa_dup(s, charset_from_locale()); |
198 | } |
199 | |
200 | wchar_t *ufroma_locale_dup(char const *s) |
201 | { |
202 | /* |
203 | * This variant uses the C library locale. |
204 | */ |
205 | wchar_t *ret; |
206 | int len; |
207 | size_t siz; |
208 | |
209 | len = strlen(s); |
210 | |
211 | ret = mknewa(wchar_t, 1 + 2*len); /* be conservative */ |
212 | |
213 | siz = mbstowcs(ret, s, len); |
214 | |
215 | if (siz) { |
216 | assert(siz <= (size_t)(2 * len)); |
217 | ret[siz] = L'\0'; |
218 | ret = resize(ret, siz+1); |
219 | return ret; |
220 | } |
221 | |
222 | /* |
223 | * If that failed, try a different strategy (which we will also |
224 | * attempt in the total absence of wcstombs). Retrieve the |
225 | * locale's charset from nl_langinfo or equivalent, and use |
226 | * normal ufroma_dup. |
227 | */ |
228 | return ufroma_dup(s, charset_from_locale()); |
229 | } |
230 | |
5dd44dce |
/*
 * Return the number of wide characters in `s' before its
 * terminating zero. `s' must not be NULL.
 */
int ustrlen(wchar_t const *s) {
    int n;

    for (n = 0; s[n]; n++)
        continue;
    return n;
}
236 | |
/*
 * Step past the wide string at `s' and its terminating zero,
 * returning a pointer to the element immediately after it.
 */
wchar_t *uadv(wchar_t *s) {
    int skip = 1 + ustrlen(s);         /* the string plus its terminator */

    return s + skip;
}
240 | |
5dd44dce |
/*
 * Copy the wide string `source', including its terminating zero,
 * into `dest'. Returns `dest'.
 */
wchar_t *ustrcpy(wchar_t *dest, wchar_t const *source) {
    wchar_t *out = dest;

    while ((*out++ = *source++) != 0)
        continue;
    return dest;
}
248 | |
/*
 * Compare two wide strings element by element.
 *
 * Returns 0 if they are equal, -1 if `lhs' sorts first and +1 if
 * `rhs' sorts first. NULL is accepted for either argument: two
 * NULLs compare equal, and a single NULL sorts before any string.
 */
int ustrcmp(wchar_t *lhs, wchar_t *rhs) {
    if (!lhs)
        return rhs ? -1 : 0;
    if (!rhs)
        return +1;

    /* Advance over the common prefix. */
    for (; *lhs && *lhs == *rhs; lhs++, rhs++)
        continue;

    return (*lhs > *rhs) - (*lhs < *rhs);
}
261 | |
/*
 * Return the lower-case form of `c'.
 *
 * With HAS_TOWLOWER defined this defers to towlower(); otherwise
 * only the letters 'A'..'Z' are mapped. Zero is always returned
 * unchanged, which ustricmp relies on.
 */
wchar_t utolower(wchar_t c) {
    if (c == L'\0')
        return c;                      /* this property needed by ustricmp */
#ifdef HAS_TOWLOWER
    return towlower(c);
#else
    return (c >= 'A' && c <= 'Z') ? c + ('a'-'A') : c;
#endif
}
273 | |
831da32e |
/*
 * Report whether `c' is alphabetic.
 *
 * With HAS_ISWALPHA defined this defers to iswalpha(); otherwise
 * only the letters 'A'..'Z' and 'a'..'z' count, and the result is
 * 1 or 0.
 */
int uisalpha(wchar_t c) {
#ifdef HAS_ISWALPHA
    return iswalpha(c);
#else
    if (c >= 'A' && c <= 'Z')
        return 1;
    return c >= 'a' && c <= 'z';
#endif
}
281 | |
d7482997 |
/*
 * Compare two wide strings after folding each character through
 * utolower(). Returns 0 if they match, -1 if `lhs' sorts first
 * and 1 otherwise. Neither argument may be NULL.
 */
int ustricmp(wchar_t *lhs, wchar_t *rhs) {
    wchar_t lc, rc;

    for (;;) {
        lc = utolower(*lhs);
        rc = utolower(*rhs);
        if (lc != rc)
            return lc < rc ? -1 : 1;
        if (!lc)
            return 0;                  /* both strings ended together */
        lhs++;
        rhs++;
    }
}
293 | |
/*
 * Convert the wide string `s' to lower case in place, one
 * character at a time via utolower(). Returns `s'.
 */
wchar_t *ustrlow(wchar_t *s) {
    wchar_t *cursor;

    for (cursor = s; *cursor; cursor++)
        *cursor = utolower(*cursor);
    return s;
}
302 | |
/*
 * Parse a decimal integer from the start of the wide string `s'.
 *
 * An optional leading '-' negates the result. Parsing stops at
 * the first character that is not an ASCII digit, so trailing
 * text is ignored and a string with no digits yields 0. No
 * overflow checking is performed.
 */
int utoi(wchar_t *s) {
    int sign = +1;
    int n = 0;

    if (*s == L'-') {
        s++;
        sign = -1;
    }

    /* The terminating zero is not a digit, so it ends the loop too. */
    while (*s >= L'0' && *s <= L'9') {
        n = n * 10 + (*s - L'0');
        s++;
    }

    /* Apply the sign recorded above; it used to be silently dropped. */
    return sign * n;
}
321 | |
322 | int utob(wchar_t *s) { |
323 | if (!ustricmp(s, L"yes") || !ustricmp(s, L"y") || |
324 | !ustricmp(s, L"true") || !ustricmp(s, L"t")) |
325 | return TRUE; |
326 | return FALSE; |
327 | } |
328 | |
/*
 * Report whether `c' is one of the ASCII digits '0'..'9'.
 * Returns 1 if so and 0 otherwise.
 */
int uisdigit(wchar_t c) {
    return !(c < L'0' || c > L'9');
}
332 | |
333 | #define USTRFTIME_DELTA 128 |
334 | wchar_t *ustrftime(wchar_t *wfmt, struct tm *timespec) { |
335 | void *blk = NULL; |
336 | wchar_t *wblk, *wp; |
337 | char *fmt, *text, *p; |
338 | size_t size = 0; |
339 | size_t len; |
340 | |
341 | /* |
e4ea58f8 |
342 | * FIXME: really we ought to copy non-% parts of the format |
343 | * ourselves, and only resort to strftime for % parts. Also we |
344 | * should use wcsftime if it's present. |
345 | */ |
346 | |
347 | /* |
d7482997 |
348 | * strftime has the entertaining property that it returns 0 |
349 | * _either_ on out-of-space _or_ on successful generation of |
350 | * the empty string. Hence we must ensure our format can never |
351 | * generate the empty string. Somebody throw a custard pie at |
352 | * whoever was responsible for that. Please? |
353 | */ |
354 | if (wfmt) { |
355 | len = ustrlen(wfmt); |
356 | fmt = mknewa(char, 2+len); |
e4ea58f8 |
357 | ustrtoa(wfmt, fmt+1, len+1, CS_ASCII); /* CS_FIXME? */ |
d7482997 |
358 | fmt[0] = ' '; |
359 | } else |
360 | fmt = " %c"; |
361 | |
362 | while (1) { |
363 | size += USTRFTIME_DELTA; |
364 | blk = resize((char *)blk, size); |
365 | len = strftime((char *)blk, size-1, fmt, timespec); |
366 | if (len > 0) |
367 | break; |
368 | } |
369 | |
370 | /* Note: +1 for the terminating 0, -1 for the initial space in fmt */ |
371 | wblk = resize((wchar_t *)blk, len); |
372 | text = mknewa(char, len); |
373 | strftime(text, len, fmt+1, timespec); |
374 | /* |
375 | * We operate in the C locale, so this all ought to be kosher |
376 | * ASCII. If we ever move outside ASCII machines, we may need |
377 | * to make this more portable... |
378 | */ |
379 | for (wp = wblk, p = text; *p; p++, wp++) |
380 | *wp = *p; |
381 | *wp = 0; |
382 | if (wfmt) |
383 | sfree(fmt); |
384 | sfree(text); |
385 | return wblk; |
386 | } |
91f93b94 |
387 | |
388 | /* |
389 | * Determine whether a Unicode string can be translated into a |
390 | * given charset without any missing characters. |
391 | */ |
392 | int cvt_ok(int charset, const wchar_t *s) |
393 | { |
394 | char buf[256]; |
395 | charset_state state = CHARSET_INIT_STATE; |
396 | int err, len = ustrlen(s); |
397 | |
398 | err = 0; |
399 | while (len > 0) { |
400 | (void)charset_from_unicode(&s, &len, buf, lenof(buf), |
401 | charset, &state, &err); |
402 | if (err) |
403 | return FALSE; |
404 | } |
405 | return TRUE; |
406 | } |