d7482997 |
1 | /* |
2 | * ustring.c: Unicode string routines |
3 | */ |
4 | |
5 | #include <wchar.h> |
7e976207 |
6 | #include <stdlib.h> |
7 | #include <assert.h> |
d7482997 |
8 | #include <time.h> |
9 | #include "halibut.h" |
10 | |
e4ea58f8 |
11 | wchar_t *ustrdup(wchar_t const *s) { |
d7482997 |
12 | wchar_t *r; |
13 | if (s) { |
f1530049 |
14 | r = snewn(1+ustrlen(s), wchar_t); |
d7482997 |
15 | ustrcpy(r, s); |
16 | } else { |
f1530049 |
17 | r = snew(wchar_t); |
d7482997 |
18 | *r = 0; |
19 | } |
20 | return r; |
21 | } |
22 | |
e4ea58f8 |
23 | static char *ustrtoa_internal(wchar_t const *s, char *outbuf, int size, |
24 | int charset, int careful) { |
25 | int len, ret, err; |
26 | charset_state state = CHARSET_INIT_STATE; |
27 | |
d7482997 |
28 | if (!s) { |
29 | *outbuf = '\0'; |
30 | return outbuf; |
31 | } |
e4ea58f8 |
32 | |
33 | len = ustrlen(s); |
34 | size--; /* leave room for terminating NUL */ |
35 | *outbuf = '\0'; |
36 | while (len > 0) { |
37 | err = 0; |
38 | ret = charset_from_unicode(&s, &len, outbuf, size, charset, &state, |
39 | (careful ? &err : NULL)); |
40 | if (err) |
41 | return NULL; |
42 | if (!ret) |
43 | return outbuf; |
44 | size -= ret; |
45 | outbuf += ret; |
46 | *outbuf = '\0'; |
47 | } |
48 | /* |
49 | * Clean up |
50 | */ |
51 | ret = charset_from_unicode(NULL, 0, outbuf, size, charset, &state, NULL); |
52 | size -= ret; |
53 | outbuf += ret; |
54 | *outbuf = '\0'; |
d7482997 |
55 | return outbuf; |
56 | } |
57 | |
e4ea58f8 |
58 | char *ustrtoa(wchar_t const *s, char *outbuf, int size, int charset) { |
59 | return ustrtoa_internal(s, outbuf, size, charset, FALSE); |
60 | } |
61 | |
62 | char *ustrtoa_careful(wchar_t const *s, char *outbuf, int size, int charset) { |
63 | return ustrtoa_internal(s, outbuf, size, charset, TRUE); |
64 | } |
65 | |
66 | wchar_t *ustrfroma(char const *s, wchar_t *outbuf, int size, int charset) { |
67 | int len, ret; |
68 | charset_state state = CHARSET_INIT_STATE; |
69 | |
ba9c1487 |
70 | if (!s) { |
71 | *outbuf = L'\0'; |
72 | return outbuf; |
73 | } |
e4ea58f8 |
74 | |
75 | len = strlen(s); |
76 | size--; /* allow for terminating NUL */ |
77 | *outbuf = L'\0'; |
78 | while (len > 0) { |
79 | ret = charset_to_unicode(&s, &len, outbuf, size, |
80 | charset, &state, NULL, 0); |
81 | if (!ret) |
82 | return outbuf; |
83 | outbuf += ret; |
84 | size -= ret; |
85 | *outbuf = L'\0'; |
86 | } |
ba9c1487 |
87 | return outbuf; |
88 | } |
89 | |
e4ea58f8 |
90 | char *utoa_internal_dup(wchar_t const *s, int charset, int *lenp, int careful) |
91 | { |
92 | char *outbuf; |
93 | int outpos, outlen, len, ret, err; |
94 | charset_state state = CHARSET_INIT_STATE; |
50d6b4bd |
95 | |
e4ea58f8 |
96 | if (!s) { |
97 | return dupstr(""); |
98 | } |
50d6b4bd |
99 | |
e4ea58f8 |
100 | len = ustrlen(s); |
101 | |
102 | outlen = len + 10; |
f1530049 |
103 | outbuf = snewn(outlen, char); |
e4ea58f8 |
104 | |
105 | outpos = 0; |
106 | outbuf[outpos] = '\0'; |
107 | |
108 | while (len > 0) { |
109 | err = 0; |
110 | ret = charset_from_unicode(&s, &len, |
111 | outbuf + outpos, outlen - outpos - 1, |
112 | charset, &state, (careful ? &err : NULL)); |
113 | if (err) { |
114 | sfree(outbuf); |
115 | return NULL; |
116 | } |
117 | if (!ret) { |
118 | outlen = outlen * 3 / 2; |
f1530049 |
119 | outbuf = sresize(outbuf, outlen, char); |
e4ea58f8 |
120 | } |
121 | outpos += ret; |
122 | outbuf[outpos] = '\0'; |
123 | } |
124 | /* |
125 | * Clean up |
126 | */ |
127 | outlen = outpos + 32; |
f1530049 |
128 | outbuf = sresize(outbuf, outlen, char); |
e4ea58f8 |
129 | ret = charset_from_unicode(NULL, 0, |
130 | outbuf + outpos, outlen - outpos + 1, |
131 | charset, &state, NULL); |
132 | outpos += ret; |
133 | outbuf[outpos] = '\0'; |
134 | if (lenp) |
135 | *lenp = outpos; |
136 | return outbuf; |
50d6b4bd |
137 | } |
138 | |
e4ea58f8 |
139 | char *utoa_dup(wchar_t const *s, int charset) |
140 | { |
141 | return utoa_internal_dup(s, charset, NULL, FALSE); |
142 | } |
143 | |
144 | char *utoa_dup_len(wchar_t const *s, int charset, int *len) |
145 | { |
146 | return utoa_internal_dup(s, charset, len, FALSE); |
147 | } |
148 | |
149 | char *utoa_careful_dup(wchar_t const *s, int charset) |
150 | { |
151 | return utoa_internal_dup(s, charset, NULL, TRUE); |
152 | } |
153 | |
154 | wchar_t *ufroma_dup(char const *s, int charset) { |
ba9c1487 |
155 | int len; |
156 | wchar_t *buf = NULL; |
157 | |
158 | len = strlen(s) + 1; |
159 | do { |
f1530049 |
160 | buf = sresize(buf, len, wchar_t); |
e4ea58f8 |
161 | ustrfroma(s, buf, len, charset); |
ba9c1487 |
162 | len = (3 * len) / 2 + 1; /* this guarantees a strict increase */ |
163 | } while (ustrlen(buf) >= len-1); |
164 | |
f1530049 |
165 | buf = sresize(buf, ustrlen(buf)+1, wchar_t); |
ba9c1487 |
166 | return buf; |
167 | } |
168 | |
7e976207 |
169 | char *utoa_locale_dup(wchar_t const *s) |
170 | { |
171 | /* |
172 | * This variant uses the C library locale. |
173 | */ |
174 | char *ret; |
175 | int len; |
176 | size_t siz; |
177 | |
178 | len = ustrlen(s); |
179 | |
f1530049 |
180 | ret = snewn(1 + MB_CUR_MAX * len, char); |
7e976207 |
181 | |
182 | siz = wcstombs(ret, s, len); |
183 | |
184 | if (siz) { |
8594a8fa |
185 | assert(siz <= (size_t)(MB_CUR_MAX * len)); |
7e976207 |
186 | ret[siz] = '\0'; |
f1530049 |
187 | ret = sresize(ret, siz+1, char); |
7e976207 |
188 | return ret; |
189 | } |
190 | |
191 | /* |
192 | * If that failed, try a different strategy (which we will also |
193 | * attempt in the total absence of wcstombs). Retrieve the |
194 | * locale's charset from nl_langinfo or equivalent, and use |
195 | * normal utoa_dup. |
196 | */ |
197 | return utoa_dup(s, charset_from_locale()); |
198 | } |
199 | |
200 | wchar_t *ufroma_locale_dup(char const *s) |
201 | { |
202 | /* |
203 | * This variant uses the C library locale. |
204 | */ |
205 | wchar_t *ret; |
206 | int len; |
207 | size_t siz; |
208 | |
209 | len = strlen(s); |
210 | |
f1530049 |
211 | ret = snewn(1 + 2*len, wchar_t); /* be conservative */ |
7e976207 |
212 | |
213 | siz = mbstowcs(ret, s, len); |
214 | |
215 | if (siz) { |
216 | assert(siz <= (size_t)(2 * len)); |
217 | ret[siz] = L'\0'; |
f1530049 |
218 | ret = sresize(ret, siz+1, wchar_t); |
7e976207 |
219 | return ret; |
220 | } |
221 | |
222 | /* |
223 | * If that failed, try a different strategy (which we will also |
224 | * attempt in the total absence of wcstombs). Retrieve the |
225 | * locale's charset from nl_langinfo or equivalent, and use |
226 | * normal ufroma_dup. |
227 | */ |
228 | return ufroma_dup(s, charset_from_locale()); |
229 | } |
230 | |
5dd44dce |
/*
 * Return the number of wide characters in `s' before its
 * terminating NUL. `s' must not be NULL.
 */
int ustrlen(wchar_t const *s) {
    wchar_t const *end = s;

    while (*end)
	end++;
    return (int)(end - s);
}
236 | |
/*
 * Step over the wide string at `s' and its terminating NUL,
 * returning a pointer to the element immediately after the NUL.
 */
wchar_t *uadv(wchar_t *s) {
    wchar_t *terminator = s + ustrlen(s);
    return terminator + 1;
}
240 | |
5dd44dce |
/*
 * Copy the wide string `source', including its terminating NUL,
 * into `dest'. Returns `dest'. No bounds checking is performed.
 */
wchar_t *ustrcpy(wchar_t *dest, wchar_t const *source) {
    wchar_t *out = dest;

    /* Copy up to and including the NUL. */
    while ((*out++ = *source++) != 0)
	continue;
    return dest;
}
248 | |
08e78486 |
/*
 * Copy characters from `source' to `dest', padding with NULs once
 * the end of `source' has been reached. Returns `dest'.
 *
 * NOTE(review): because the count is tested after each store, this
 * writes n+1 elements (and one element even when n <= 0), unlike
 * strncpy which writes exactly n. As with strncpy, the result is
 * not NUL-terminated if `source' is longer than the amount copied.
 * Confirm that callers size their buffers for n+1 elements.
 */
wchar_t *ustrncpy(wchar_t *dest, wchar_t const *source, int n) {
    wchar_t *out = dest;

    for (;;) {
	*out++ = *source;
	/* Stay on the NUL once reached, so the rest is zero padding. */
	if (*source)
	    source++;
	if (n-- <= 0)
	    break;
    }
    return dest;
}
257 | |
d7482997 |
/*
 * Compare two wide strings element by element. Returns -1, 0 or +1
 * according to whether `lhs' sorts before, equal to or after `rhs'.
 * NULL pointers are accepted: NULL equals NULL and sorts before any
 * non-NULL string.
 */
int ustrcmp(wchar_t *lhs, wchar_t *rhs) {
    if (!lhs || !rhs) {
	if (lhs)
	    return +1;		       /* only rhs is NULL */
	if (rhs)
	    return -1;		       /* only lhs is NULL */
	return 0;		       /* both NULL */
    }

    /* Skip the common prefix; equal elements end together at NUL. */
    for (; *lhs && *lhs == *rhs; lhs++, rhs++)
	continue;

    if (*lhs == *rhs)
	return 0;
    return *lhs < *rhs ? -1 : 1;
}
270 | |
/*
 * Map a wide character to lower case. Uses towlower when
 * HAS_TOWLOWER is defined; otherwise only the ASCII letters A-Z are
 * folded and every other character is returned unchanged.
 */
wchar_t utolower(wchar_t c) {
    /* NUL must always map to NUL: ustricmp relies on this. */
    if (c == L'\0')
	return c;
#ifdef HAS_TOWLOWER
    return towlower(c);
#else
    return (c >= 'A' && c <= 'Z') ? c + ('a'-'A') : c;
#endif
}
282 | |
831da32e |
/*
 * Report whether `c' is alphabetic. Uses iswalpha when HAS_ISWALPHA
 * is defined; otherwise only the ASCII letters count, and the result
 * is exactly 0 or 1.
 */
int uisalpha(wchar_t c) {
#ifdef HAS_ISWALPHA
    return iswalpha(c);
#else
    if (c >= 'a' && c <= 'z')
	return 1;
    return c >= 'A' && c <= 'Z';
#endif
}
290 | |
78c73085 |
/*
 * Case-insensitive comparison of two wide strings, folding each
 * character through utolower. Returns -1, 0 or 1. Both pointers are
 * dereferenced unconditionally, so neither may be NULL.
 */
int ustricmp(wchar_t const *lhs, wchar_t const *rhs) {
    wchar_t lc, rc;

    for (;;) {
	lc = utolower(*lhs);
	rc = utolower(*rhs);
	if (lc != rc)
	    return lc < rc ? -1 : 1;
	/* Equal here; if that is the NUL, both strings have ended. */
	if (!lc)
	    return 0;
	lhs++, rhs++;
    }
}
302 | |
78c73085 |
/*
 * Case-insensitive comparison of at most `maxlen' leading characters
 * of two wide strings, folding through utolower. Returns -1, 0 or 1;
 * a `maxlen' of zero or less compares equal.
 */
int ustrnicmp(wchar_t const *lhs, wchar_t const *rhs, int maxlen) {
    wchar_t lc, rc;

    while (maxlen-- > 0) {
	lc = utolower(*lhs++);
	rc = utolower(*rhs++);
	if (lc != rc)
	    return lc < rc ? -1 : 1;
	/* Equal here; if that is the NUL, both strings have ended. */
	if (!lc)
	    break;
    }
    return 0;
}
315 | |
d7482997 |
/*
 * Convert the wide string `s' to lower case in place, one character
 * at a time via utolower. Returns `s'.
 */
wchar_t *ustrlow(wchar_t *s) {
    wchar_t *p;

    for (p = s; *p; p++)
	*p = utolower(*p);
    return s;
}
324 | |
dd567011 |
/*
 * Parse a decimal integer from the start of the wide string `s'.
 *
 * An optional leading '-' is accepted and negates the result.
 * Parsing stops at the first character that is not an ASCII digit;
 * if there are no digits at all the result is 0. No overflow
 * checking is performed, and `s' must not be NULL.
 */
int utoi(wchar_t const *s) {
    int sign = +1;
    int n = 0;

    if (*s == L'-') {
	s++;
	sign = -1;
    }

    /* The range test also stops the loop at the terminating NUL. */
    while (*s >= L'0' && *s <= L'9') {
	n = n * 10 + (int)(*s - L'0');
	s++;
    }

    /* Apply the sign recorded above (previously it was ignored). */
    return sign * n;
}
343 | |
dd567011 |
344 | double utof(wchar_t const *s) |
345 | { |
346 | char *cs = utoa_dup(s, CS_ASCII); |
347 | double ret = atof(cs); |
348 | sfree(cs); |
349 | return ret; |
350 | } |
351 | |
352 | int utob(wchar_t const *s) { |
d7482997 |
353 | if (!ustricmp(s, L"yes") || !ustricmp(s, L"y") || |
354 | !ustricmp(s, L"true") || !ustricmp(s, L"t")) |
355 | return TRUE; |
356 | return FALSE; |
357 | } |
358 | |
/*
 * Report whether `c' is one of the ASCII decimal digits '0'..'9'.
 * The result is exactly 0 or 1.
 */
int uisdigit(wchar_t c) {
    if (c < L'0')
	return 0;
    return c <= L'9';
}
362 | |
363 | #define USTRFTIME_DELTA 128 |
c8422236 |
364 | static void ustrftime_internal(rdstring *rs, char formatchr, |
365 | const struct tm *timespec) |
366 | { |
e4ea58f8 |
367 | /* |
d7482997 |
368 | * strftime has the entertaining property that it returns 0 |
369 | * _either_ on out-of-space _or_ on successful generation of |
370 | * the empty string. Hence we must ensure our format can never |
371 | * generate the empty string. Somebody throw a custard pie at |
372 | * whoever was responsible for that. Please? |
373 | */ |
c8422236 |
374 | |
375 | #ifdef HAS_WCSFTIME |
376 | wchar_t *buf = NULL; |
377 | wchar_t fmt[4]; |
378 | int size, ret; |
379 | |
380 | fmt[0] = L' '; |
381 | fmt[1] = L'%'; |
382 | /* Format chars are all ASCII, so conversion to Unicode is no problem */ |
383 | fmt[2] = formatchr; |
384 | fmt[3] = L'\0'; |
385 | |
386 | size = 0; |
387 | do { |
d7482997 |
388 | size += USTRFTIME_DELTA; |
f1530049 |
389 | buf = sresize(buf, size, wchar_t); |
c8422236 |
390 | ret = (int) wcsftime(buf, size, fmt, timespec); |
391 | } while (ret == 0); |
392 | |
393 | rdadds(rs, buf+1); |
394 | sfree(buf); |
395 | #else |
396 | char *buf = NULL; |
397 | wchar_t *cvtbuf; |
398 | char fmt[4]; |
399 | int size, ret; |
400 | |
401 | fmt[0] = ' '; |
402 | fmt[1] = '%'; |
403 | fmt[2] = formatchr; |
404 | fmt[3] = '\0'; |
405 | |
406 | size = 0; |
407 | do { |
408 | size += USTRFTIME_DELTA; |
f1530049 |
409 | buf = sresize(buf, size, char); |
c8422236 |
410 | ret = (int) strftime(buf, size, fmt, timespec); |
411 | } while (ret == 0); |
412 | |
413 | cvtbuf = ufroma_locale_dup(buf+1); |
414 | rdadds(rs, cvtbuf); |
415 | sfree(cvtbuf); |
416 | sfree(buf); |
417 | #endif |
418 | } |
419 | |
420 | wchar_t *ustrftime(const wchar_t *wfmt, const struct tm *timespec) |
421 | { |
422 | rdstring rs = { 0, 0, NULL }; |
423 | |
424 | if (!wfmt) |
425 | wfmt = L"%c"; |
426 | |
427 | while (*wfmt) { |
428 | if (wfmt[0] == L'%' && wfmt[1] == L'%') { |
429 | rdadd(&rs, L'%'); |
430 | wfmt += 2; |
431 | } else if (wfmt[0] == L'%' && wfmt[1]) { |
432 | ustrftime_internal(&rs, wfmt[1], timespec); |
433 | wfmt += 2; |
434 | } else { |
435 | rdadd(&rs, wfmt[0]); |
436 | wfmt++; |
437 | } |
d7482997 |
438 | } |
439 | |
c8422236 |
440 | return rdtrim(&rs); |
d7482997 |
441 | } |
91f93b94 |
442 | |
443 | /* |
444 | * Determine whether a Unicode string can be translated into a |
445 | * given charset without any missing characters. |
446 | */ |
447 | int cvt_ok(int charset, const wchar_t *s) |
448 | { |
449 | char buf[256]; |
450 | charset_state state = CHARSET_INIT_STATE; |
451 | int err, len = ustrlen(s); |
452 | |
453 | err = 0; |
454 | while (len > 0) { |
455 | (void)charset_from_unicode(&s, &len, buf, lenof(buf), |
456 | charset, &state, &err); |
457 | if (err) |
458 | return FALSE; |
459 | } |
460 | return TRUE; |
461 | } |
0960a3d8 |
462 | |
463 | /* |
464 | * Wrapper around charset_from_localenc which accepts the charset |
465 | * name as a wide string (since that happens to be more useful). |
466 | * Also throws a Halibut error and falls back to CS_ASCII if the |
467 | * charset is unrecognised, meaning the rest of the program can |
468 | * rely on always getting a valid charset id back from this |
469 | * function. |
470 | */ |
471 | int charset_from_ustr(filepos *fpos, const wchar_t *name) |
472 | { |
473 | char *csname; |
474 | int charset; |
475 | |
476 | csname = utoa_dup(name, CS_ASCII); |
477 | charset = charset_from_localenc(csname); |
478 | |
479 | if (charset == CS_NONE) { |
480 | charset = CS_ASCII; |
481 | error(err_charset, fpos, name); |
482 | } |
483 | |
484 | sfree(csname); |
485 | return charset; |
486 | } |