d7482997 |
1 | /* |
2 | * ustring.c: Unicode string routines |
3 | */ |
4 | |
5 | #include <wchar.h> |
7e976207 |
6 | #include <stdlib.h> |
7 | #include <assert.h> |
d7482997 |
8 | #include <time.h> |
9 | #include "halibut.h" |
10 | |
e4ea58f8 |
11 | wchar_t *ustrdup(wchar_t const *s) { |
d7482997 |
12 | wchar_t *r; |
13 | if (s) { |
f1530049 |
14 | r = snewn(1+ustrlen(s), wchar_t); |
d7482997 |
15 | ustrcpy(r, s); |
16 | } else { |
f1530049 |
17 | r = snew(wchar_t); |
d7482997 |
18 | *r = 0; |
19 | } |
20 | return r; |
21 | } |
22 | |
e4ea58f8 |
23 | static char *ustrtoa_internal(wchar_t const *s, char *outbuf, int size, |
24 | int charset, int careful) { |
25 | int len, ret, err; |
26 | charset_state state = CHARSET_INIT_STATE; |
27 | |
d7482997 |
28 | if (!s) { |
29 | *outbuf = '\0'; |
30 | return outbuf; |
31 | } |
e4ea58f8 |
32 | |
33 | len = ustrlen(s); |
34 | size--; /* leave room for terminating NUL */ |
35 | *outbuf = '\0'; |
36 | while (len > 0) { |
37 | err = 0; |
38 | ret = charset_from_unicode(&s, &len, outbuf, size, charset, &state, |
39 | (careful ? &err : NULL)); |
40 | if (err) |
41 | return NULL; |
42 | if (!ret) |
43 | return outbuf; |
44 | size -= ret; |
45 | outbuf += ret; |
46 | *outbuf = '\0'; |
47 | } |
48 | /* |
49 | * Clean up |
50 | */ |
51 | ret = charset_from_unicode(NULL, 0, outbuf, size, charset, &state, NULL); |
52 | size -= ret; |
53 | outbuf += ret; |
54 | *outbuf = '\0'; |
d7482997 |
55 | return outbuf; |
56 | } |
57 | |
e4ea58f8 |
58 | char *ustrtoa(wchar_t const *s, char *outbuf, int size, int charset) { |
59 | return ustrtoa_internal(s, outbuf, size, charset, FALSE); |
60 | } |
61 | |
62 | char *ustrtoa_careful(wchar_t const *s, char *outbuf, int size, int charset) { |
63 | return ustrtoa_internal(s, outbuf, size, charset, TRUE); |
64 | } |
65 | |
66 | wchar_t *ustrfroma(char const *s, wchar_t *outbuf, int size, int charset) { |
67 | int len, ret; |
68 | charset_state state = CHARSET_INIT_STATE; |
69 | |
ba9c1487 |
70 | if (!s) { |
71 | *outbuf = L'\0'; |
72 | return outbuf; |
73 | } |
e4ea58f8 |
74 | |
75 | len = strlen(s); |
76 | size--; /* allow for terminating NUL */ |
77 | *outbuf = L'\0'; |
78 | while (len > 0) { |
79 | ret = charset_to_unicode(&s, &len, outbuf, size, |
80 | charset, &state, NULL, 0); |
81 | if (!ret) |
82 | return outbuf; |
83 | outbuf += ret; |
84 | size -= ret; |
85 | *outbuf = L'\0'; |
86 | } |
ba9c1487 |
87 | return outbuf; |
88 | } |
89 | |
e4ea58f8 |
90 | char *utoa_internal_dup(wchar_t const *s, int charset, int *lenp, int careful) |
91 | { |
92 | char *outbuf; |
93 | int outpos, outlen, len, ret, err; |
94 | charset_state state = CHARSET_INIT_STATE; |
50d6b4bd |
95 | |
e4ea58f8 |
96 | if (!s) { |
97 | return dupstr(""); |
98 | } |
50d6b4bd |
99 | |
e4ea58f8 |
100 | len = ustrlen(s); |
101 | |
102 | outlen = len + 10; |
f1530049 |
103 | outbuf = snewn(outlen, char); |
e4ea58f8 |
104 | |
105 | outpos = 0; |
106 | outbuf[outpos] = '\0'; |
107 | |
108 | while (len > 0) { |
109 | err = 0; |
110 | ret = charset_from_unicode(&s, &len, |
111 | outbuf + outpos, outlen - outpos - 1, |
112 | charset, &state, (careful ? &err : NULL)); |
113 | if (err) { |
114 | sfree(outbuf); |
115 | return NULL; |
116 | } |
117 | if (!ret) { |
118 | outlen = outlen * 3 / 2; |
f1530049 |
119 | outbuf = sresize(outbuf, outlen, char); |
e4ea58f8 |
120 | } |
121 | outpos += ret; |
122 | outbuf[outpos] = '\0'; |
123 | } |
124 | /* |
125 | * Clean up |
126 | */ |
127 | outlen = outpos + 32; |
f1530049 |
128 | outbuf = sresize(outbuf, outlen, char); |
e4ea58f8 |
129 | ret = charset_from_unicode(NULL, 0, |
130 | outbuf + outpos, outlen - outpos + 1, |
131 | charset, &state, NULL); |
132 | outpos += ret; |
133 | outbuf[outpos] = '\0'; |
134 | if (lenp) |
135 | *lenp = outpos; |
136 | return outbuf; |
50d6b4bd |
137 | } |
138 | |
e4ea58f8 |
139 | char *utoa_dup(wchar_t const *s, int charset) |
140 | { |
141 | return utoa_internal_dup(s, charset, NULL, FALSE); |
142 | } |
143 | |
144 | char *utoa_dup_len(wchar_t const *s, int charset, int *len) |
145 | { |
146 | return utoa_internal_dup(s, charset, len, FALSE); |
147 | } |
148 | |
149 | char *utoa_careful_dup(wchar_t const *s, int charset) |
150 | { |
151 | return utoa_internal_dup(s, charset, NULL, TRUE); |
152 | } |
153 | |
154 | wchar_t *ufroma_dup(char const *s, int charset) { |
ba9c1487 |
155 | int len; |
156 | wchar_t *buf = NULL; |
157 | |
158 | len = strlen(s) + 1; |
159 | do { |
f1530049 |
160 | buf = sresize(buf, len, wchar_t); |
e4ea58f8 |
161 | ustrfroma(s, buf, len, charset); |
ba9c1487 |
162 | len = (3 * len) / 2 + 1; /* this guarantees a strict increase */ |
163 | } while (ustrlen(buf) >= len-1); |
164 | |
f1530049 |
165 | buf = sresize(buf, ustrlen(buf)+1, wchar_t); |
ba9c1487 |
166 | return buf; |
167 | } |
168 | |
7e976207 |
169 | char *utoa_locale_dup(wchar_t const *s) |
170 | { |
171 | /* |
172 | * This variant uses the C library locale. |
173 | */ |
174 | char *ret; |
8281de1b |
175 | int len, outlen; |
7e976207 |
176 | size_t siz; |
177 | |
178 | len = ustrlen(s); |
179 | |
8281de1b |
180 | outlen = 1 + MB_CUR_MAX * len; |
181 | ret = snewn(outlen+1, char); |
7e976207 |
182 | |
8281de1b |
183 | siz = wcstombs(ret, s, outlen); |
7e976207 |
184 | |
185 | if (siz) { |
8281de1b |
186 | assert(siz <= (size_t)(outlen)); |
7e976207 |
187 | ret[siz] = '\0'; |
f1530049 |
188 | ret = sresize(ret, siz+1, char); |
7e976207 |
189 | return ret; |
190 | } |
191 | |
192 | /* |
193 | * If that failed, try a different strategy (which we will also |
194 | * attempt in the total absence of wcstombs). Retrieve the |
195 | * locale's charset from nl_langinfo or equivalent, and use |
196 | * normal utoa_dup. |
197 | */ |
198 | return utoa_dup(s, charset_from_locale()); |
199 | } |
200 | |
201 | wchar_t *ufroma_locale_dup(char const *s) |
202 | { |
203 | /* |
204 | * This variant uses the C library locale. |
205 | */ |
206 | wchar_t *ret; |
8281de1b |
207 | int len, outlen; |
7e976207 |
208 | size_t siz; |
209 | |
210 | len = strlen(s); |
211 | |
8281de1b |
212 | outlen = 1 + 2*len; |
213 | ret = snewn(outlen+1, wchar_t); /* be conservative */ |
7e976207 |
214 | |
8281de1b |
215 | siz = mbstowcs(ret, s, outlen); |
7e976207 |
216 | |
217 | if (siz) { |
8281de1b |
218 | assert(siz <= (size_t)(outlen)); |
7e976207 |
219 | ret[siz] = L'\0'; |
f1530049 |
220 | ret = sresize(ret, siz+1, wchar_t); |
7e976207 |
221 | return ret; |
222 | } |
223 | |
224 | /* |
225 | * If that failed, try a different strategy (which we will also |
226 | * attempt in the total absence of wcstombs). Retrieve the |
227 | * locale's charset from nl_langinfo or equivalent, and use |
228 | * normal ufroma_dup. |
229 | */ |
230 | return ufroma_dup(s, charset_from_locale()); |
231 | } |
232 | |
5dd44dce |
/*
 * Count the wide characters in `s' before the terminating L'\0'.
 */
int ustrlen(wchar_t const *s) {
    wchar_t const *end = s;

    while (*end != L'\0')
        end++;
    return (int)(end - s);
}
238 | |
/*
 * Step past the string at `s' and its terminator, returning a
 * pointer to the element immediately following the L'\0'.
 */
wchar_t *uadv(wchar_t *s) {
    int const skip = 1 + ustrlen(s);

    return s + skip;
}
242 | |
5dd44dce |
/*
 * Copy the wide string `source', terminator included, into `dest'.
 * Returns `dest'.
 */
wchar_t *ustrcpy(wchar_t *dest, wchar_t const *source) {
    wchar_t *out = dest;

    while ((*out++ = *source++) != L'\0')
        continue;
    return dest;
}
250 | |
08e78486 |
/*
 * Bounded wide-string copy. Returns `dest'.
 *
 * One element is always written, and then one more for each unit
 * of `n', so `dest' receives n+1 elements in total (a single
 * element when n <= 0). Once the terminator of `source' has been
 * reached it is written repeatedly, padding the remainder with
 * L'\0'.
 */
wchar_t *ustrncpy(wchar_t *dest, wchar_t const *source, int n) {
    wchar_t *out = dest;

    for (;;) {
        *out++ = *source;
        if (*source != L'\0')
            source++;
        if (n-- <= 0)
            break;
    }
    return dest;
}
259 | |
d7482997 |
/*
 * Compare two wide strings element by element. Returns -1, 0 or
 * +1. NULL pointers are accepted: two NULLs compare equal, and a
 * NULL sorts before any non-NULL string.
 */
int ustrcmp(wchar_t *lhs, wchar_t *rhs) {
    if (!lhs || !rhs) {
        if (lhs)
            return +1;
        if (rhs)
            return -1;
        return 0;
    }

    /* If *lhs is nonzero and equals *rhs, *rhs is nonzero too. */
    for (; *lhs && *lhs == *rhs; lhs++, rhs++)
        continue;

    return (*lhs > *rhs) - (*lhs < *rhs);
}
272 | |
/*
 * Lower-case a single wide character. L'\0' is always returned
 * unchanged (ustricmp relies on this). With HAS_TOWLOWER defined
 * the work is delegated to towlower(); otherwise only 'A'..'Z' are
 * mapped.
 */
wchar_t utolower(wchar_t c) {
    if (c == L'\0')
        return c;                      /* this property needed by ustricmp */
#ifdef HAS_TOWLOWER
    return towlower(c);
#else
    return (c >= 'A' && c <= 'Z') ? c + ('a'-'A') : c;
#endif
}
284 | |
831da32e |
/*
 * Test whether `c' is alphabetic. With HAS_ISWALPHA defined this
 * is iswalpha(); otherwise only 'A'..'Z' and 'a'..'z' qualify.
 */
int uisalpha(wchar_t c) {
#ifdef HAS_ISWALPHA
    return iswalpha(c);
#else
    int const upper = (c >= 'A' && c <= 'Z');
    int const lower = (c >= 'a' && c <= 'z');

    return upper || lower;
#endif
}
292 | |
78c73085 |
/*
 * Compare two wide strings after folding each character through
 * utolower. Returns -1, 0 or 1.
 */
int ustricmp(wchar_t const *lhs, wchar_t const *rhs) {
    for (;; lhs++, rhs++) {
        wchar_t const l = utolower(*lhs);
        wchar_t const r = utolower(*rhs);

        if (l != r)
            return (l < r) ? -1 : 1;
        if (l == L'\0')
            return 0;                  /* both strings ended together */
    }
}
304 | |
78c73085 |
/*
 * As ustricmp, but examines at most `maxlen' characters. Returns
 * -1, 0 or 1; a `maxlen' of zero or less always yields 0.
 */
int ustrnicmp(wchar_t const *lhs, wchar_t const *rhs, int maxlen) {
    wchar_t l = 0, r = 0;

    while (maxlen-- > 0) {
        l = utolower(*lhs);
        r = utolower(*rhs);
        /* Stop on a mismatch, or when both strings end together. */
        if (l != r || l == L'\0')
            break;
        lhs++;
        rhs++;
    }

    if (l < r)
        return -1;
    return (l > r) ? 1 : 0;
}
317 | |
d7482997 |
/*
 * Lower-case the wide string `s' in place, one character at a time
 * via utolower. Returns `s'.
 */
wchar_t *ustrlow(wchar_t *s) {
    wchar_t *p;

    for (p = s; *p; p++)
        *p = utolower(*p);
    return s;
}
326 | |
dd567011 |
/*
 * Parse a decimal integer from the start of `s': an optional
 * leading L'-' followed by digits L'0'..L'9'. Parsing stops at the
 * first non-digit; if there are no digits the result is 0. No
 * leading whitespace or L'+' sign is accepted.
 */
int utoi(wchar_t const *s) {
    int const negative = (*s == L'-');
    int value = 0;

    if (negative)
        s++;

    for (; *s >= L'0' && *s <= L'9'; s++) {
        value *= 10;
        value += (*s - '0');
    }

    return negative ? value * -1 : value;
}
345 | |
dd567011 |
346 | double utof(wchar_t const *s) |
347 | { |
348 | char *cs = utoa_dup(s, CS_ASCII); |
349 | double ret = atof(cs); |
350 | sfree(cs); |
351 | return ret; |
352 | } |
353 | |
354 | int utob(wchar_t const *s) { |
d7482997 |
355 | if (!ustricmp(s, L"yes") || !ustricmp(s, L"y") || |
356 | !ustricmp(s, L"true") || !ustricmp(s, L"t")) |
357 | return TRUE; |
358 | return FALSE; |
359 | } |
360 | |
/*
 * Test whether `c' is one of the digits L'0'..L'9'. Returns 1 if
 * so, 0 otherwise.
 */
int uisdigit(wchar_t c) {
    if (c < L'0')
        return 0;
    return c <= L'9';
}
364 | |
365 | #define USTRFTIME_DELTA 128 |
c8422236 |
366 | static void ustrftime_internal(rdstring *rs, char formatchr, |
367 | const struct tm *timespec) |
368 | { |
e4ea58f8 |
369 | /* |
d7482997 |
370 | * strftime has the entertaining property that it returns 0 |
371 | * _either_ on out-of-space _or_ on successful generation of |
372 | * the empty string. Hence we must ensure our format can never |
373 | * generate the empty string. Somebody throw a custard pie at |
374 | * whoever was responsible for that. Please? |
375 | */ |
c8422236 |
376 | |
377 | #ifdef HAS_WCSFTIME |
378 | wchar_t *buf = NULL; |
379 | wchar_t fmt[4]; |
380 | int size, ret; |
381 | |
382 | fmt[0] = L' '; |
383 | fmt[1] = L'%'; |
384 | /* Format chars are all ASCII, so conversion to Unicode is no problem */ |
385 | fmt[2] = formatchr; |
386 | fmt[3] = L'\0'; |
387 | |
388 | size = 0; |
389 | do { |
d7482997 |
390 | size += USTRFTIME_DELTA; |
f1530049 |
391 | buf = sresize(buf, size, wchar_t); |
c8422236 |
392 | ret = (int) wcsftime(buf, size, fmt, timespec); |
393 | } while (ret == 0); |
394 | |
395 | rdadds(rs, buf+1); |
396 | sfree(buf); |
397 | #else |
398 | char *buf = NULL; |
399 | wchar_t *cvtbuf; |
400 | char fmt[4]; |
401 | int size, ret; |
402 | |
403 | fmt[0] = ' '; |
404 | fmt[1] = '%'; |
405 | fmt[2] = formatchr; |
406 | fmt[3] = '\0'; |
407 | |
408 | size = 0; |
409 | do { |
410 | size += USTRFTIME_DELTA; |
f1530049 |
411 | buf = sresize(buf, size, char); |
c8422236 |
412 | ret = (int) strftime(buf, size, fmt, timespec); |
413 | } while (ret == 0); |
414 | |
415 | cvtbuf = ufroma_locale_dup(buf+1); |
416 | rdadds(rs, cvtbuf); |
417 | sfree(cvtbuf); |
418 | sfree(buf); |
419 | #endif |
420 | } |
421 | |
422 | wchar_t *ustrftime(const wchar_t *wfmt, const struct tm *timespec) |
423 | { |
424 | rdstring rs = { 0, 0, NULL }; |
425 | |
426 | if (!wfmt) |
427 | wfmt = L"%c"; |
428 | |
429 | while (*wfmt) { |
430 | if (wfmt[0] == L'%' && wfmt[1] == L'%') { |
431 | rdadd(&rs, L'%'); |
432 | wfmt += 2; |
433 | } else if (wfmt[0] == L'%' && wfmt[1]) { |
434 | ustrftime_internal(&rs, wfmt[1], timespec); |
435 | wfmt += 2; |
436 | } else { |
437 | rdadd(&rs, wfmt[0]); |
438 | wfmt++; |
439 | } |
d7482997 |
440 | } |
441 | |
c8422236 |
442 | return rdtrim(&rs); |
d7482997 |
443 | } |
91f93b94 |
444 | |
445 | /* |
446 | * Determine whether a Unicode string can be translated into a |
447 | * given charset without any missing characters. |
448 | */ |
449 | int cvt_ok(int charset, const wchar_t *s) |
450 | { |
451 | char buf[256]; |
452 | charset_state state = CHARSET_INIT_STATE; |
453 | int err, len = ustrlen(s); |
454 | |
455 | err = 0; |
456 | while (len > 0) { |
457 | (void)charset_from_unicode(&s, &len, buf, lenof(buf), |
458 | charset, &state, &err); |
459 | if (err) |
460 | return FALSE; |
461 | } |
462 | return TRUE; |
463 | } |
0960a3d8 |
464 | |
465 | /* |
466 | * Wrapper around charset_from_localenc which accepts the charset |
467 | * name as a wide string (since that happens to be more useful). |
468 | * Also throws a Halibut error and falls back to CS_ASCII if the |
469 | * charset is unrecognised, meaning the rest of the program can |
470 | * rely on always getting a valid charset id back from this |
471 | * function. |
472 | */ |
473 | int charset_from_ustr(filepos *fpos, const wchar_t *name) |
474 | { |
475 | char *csname; |
476 | int charset; |
477 | |
478 | csname = utoa_dup(name, CS_ASCII); |
479 | charset = charset_from_localenc(csname); |
480 | |
481 | if (charset == CS_NONE) { |
482 | charset = CS_ASCII; |
483 | error(err_charset, fpos, name); |
484 | } |
485 | |
486 | sfree(csname); |
487 | return charset; |
488 | } |