d7482997 |
1 | /* |
2 | * ustring.c: Unicode string routines |
3 | */ |
4 | |
5 | #include <wchar.h> |
7e976207 |
6 | #include <stdlib.h> |
7 | #include <assert.h> |
d7482997 |
8 | #include <time.h> |
9 | #include "halibut.h" |
10 | |
e4ea58f8 |
11 | wchar_t *ustrdup(wchar_t const *s) { |
d7482997 |
12 | wchar_t *r; |
13 | if (s) { |
14 | r = mknewa(wchar_t, 1+ustrlen(s)); |
15 | ustrcpy(r, s); |
16 | } else { |
17 | r = mknew(wchar_t); |
18 | *r = 0; |
19 | } |
20 | return r; |
21 | } |
22 | |
e4ea58f8 |
23 | static char *ustrtoa_internal(wchar_t const *s, char *outbuf, int size, |
24 | int charset, int careful) { |
25 | int len, ret, err; |
26 | charset_state state = CHARSET_INIT_STATE; |
27 | |
d7482997 |
28 | if (!s) { |
29 | *outbuf = '\0'; |
30 | return outbuf; |
31 | } |
e4ea58f8 |
32 | |
33 | len = ustrlen(s); |
34 | size--; /* leave room for terminating NUL */ |
35 | *outbuf = '\0'; |
36 | while (len > 0) { |
37 | err = 0; |
38 | ret = charset_from_unicode(&s, &len, outbuf, size, charset, &state, |
39 | (careful ? &err : NULL)); |
40 | if (err) |
41 | return NULL; |
42 | if (!ret) |
43 | return outbuf; |
44 | size -= ret; |
45 | outbuf += ret; |
46 | *outbuf = '\0'; |
47 | } |
48 | /* |
49 | * Clean up |
50 | */ |
51 | ret = charset_from_unicode(NULL, 0, outbuf, size, charset, &state, NULL); |
52 | size -= ret; |
53 | outbuf += ret; |
54 | *outbuf = '\0'; |
d7482997 |
55 | return outbuf; |
56 | } |
57 | |
e4ea58f8 |
58 | char *ustrtoa(wchar_t const *s, char *outbuf, int size, int charset) { |
59 | return ustrtoa_internal(s, outbuf, size, charset, FALSE); |
60 | } |
61 | |
62 | char *ustrtoa_careful(wchar_t const *s, char *outbuf, int size, int charset) { |
63 | return ustrtoa_internal(s, outbuf, size, charset, TRUE); |
64 | } |
65 | |
66 | wchar_t *ustrfroma(char const *s, wchar_t *outbuf, int size, int charset) { |
67 | int len, ret; |
68 | charset_state state = CHARSET_INIT_STATE; |
69 | |
ba9c1487 |
70 | if (!s) { |
71 | *outbuf = L'\0'; |
72 | return outbuf; |
73 | } |
e4ea58f8 |
74 | |
75 | len = strlen(s); |
76 | size--; /* allow for terminating NUL */ |
77 | *outbuf = L'\0'; |
78 | while (len > 0) { |
79 | ret = charset_to_unicode(&s, &len, outbuf, size, |
80 | charset, &state, NULL, 0); |
81 | if (!ret) |
82 | return outbuf; |
83 | outbuf += ret; |
84 | size -= ret; |
85 | *outbuf = L'\0'; |
86 | } |
ba9c1487 |
87 | return outbuf; |
88 | } |
89 | |
e4ea58f8 |
90 | char *utoa_internal_dup(wchar_t const *s, int charset, int *lenp, int careful) |
91 | { |
92 | char *outbuf; |
93 | int outpos, outlen, len, ret, err; |
94 | charset_state state = CHARSET_INIT_STATE; |
50d6b4bd |
95 | |
e4ea58f8 |
96 | if (!s) { |
97 | return dupstr(""); |
98 | } |
50d6b4bd |
99 | |
e4ea58f8 |
100 | len = ustrlen(s); |
101 | |
102 | outlen = len + 10; |
103 | outbuf = mknewa(char, outlen); |
104 | |
105 | outpos = 0; |
106 | outbuf[outpos] = '\0'; |
107 | |
108 | while (len > 0) { |
109 | err = 0; |
110 | ret = charset_from_unicode(&s, &len, |
111 | outbuf + outpos, outlen - outpos - 1, |
112 | charset, &state, (careful ? &err : NULL)); |
113 | if (err) { |
114 | sfree(outbuf); |
115 | return NULL; |
116 | } |
117 | if (!ret) { |
118 | outlen = outlen * 3 / 2; |
119 | outbuf = resize(outbuf, outlen); |
120 | } |
121 | outpos += ret; |
122 | outbuf[outpos] = '\0'; |
123 | } |
124 | /* |
125 | * Clean up |
126 | */ |
127 | outlen = outpos + 32; |
128 | outbuf = resize(outbuf, outlen); |
129 | ret = charset_from_unicode(NULL, 0, |
130 | outbuf + outpos, outlen - outpos + 1, |
131 | charset, &state, NULL); |
132 | outpos += ret; |
133 | outbuf[outpos] = '\0'; |
134 | if (lenp) |
135 | *lenp = outpos; |
136 | return outbuf; |
50d6b4bd |
137 | } |
138 | |
e4ea58f8 |
139 | char *utoa_dup(wchar_t const *s, int charset) |
140 | { |
141 | return utoa_internal_dup(s, charset, NULL, FALSE); |
142 | } |
143 | |
144 | char *utoa_dup_len(wchar_t const *s, int charset, int *len) |
145 | { |
146 | return utoa_internal_dup(s, charset, len, FALSE); |
147 | } |
148 | |
149 | char *utoa_careful_dup(wchar_t const *s, int charset) |
150 | { |
151 | return utoa_internal_dup(s, charset, NULL, TRUE); |
152 | } |
153 | |
/*
 * Convert the char string `s', encoded in `charset', to a wide
 * string held in a buffer obtained via resize(), which the caller
 * takes over. A NULL `s' yields an empty wide string (ustrfroma()
 * accepts NULL).
 *
 * ustrfroma() gives no direct indication of truncation, so the
 * conversion is retried with a larger buffer whenever the output
 * completely fills the capacity it was offered. The test must be
 * made against the capacity that was actually passed to ustrfroma();
 * comparing against the already-enlarged size would never trigger a
 * retry.
 */
wchar_t *ufroma_dup(char const *s, int charset) {
    int cap;
    wchar_t *buf = NULL;

    /*
     * One slot for the NUL plus one spare, so that a conversion
     * producing one wide character per input byte is not mistaken
     * for a truncated one.
     */
    cap = (int)(s ? strlen(s) : 0) + 2;

    for (;;) {
        buf = resize(buf, cap);
        ustrfroma(s, buf, cap, charset);
        if (ustrlen(buf) < cap - 1)
            break;                     /* output fitted with room to spare */
        cap = (3 * cap) / 2 + 1;       /* this guarantees a strict increase */
    }

    /* Trim the buffer down to exactly what is needed. */
    buf = resize(buf, ustrlen(buf)+1);
    return buf;
}
168 | |
7e976207 |
169 | char *utoa_locale_dup(wchar_t const *s) |
170 | { |
171 | /* |
172 | * This variant uses the C library locale. |
173 | */ |
174 | char *ret; |
175 | int len; |
176 | size_t siz; |
177 | |
178 | len = ustrlen(s); |
179 | |
180 | ret = mknewa(char, 1 + MB_CUR_MAX * len); |
181 | |
182 | siz = wcstombs(ret, s, len); |
183 | |
184 | if (siz) { |
185 | assert(siz <= MB_CUR_MAX * len); |
186 | ret[siz] = '\0'; |
187 | ret = resize(ret, siz+1); |
188 | return ret; |
189 | } |
190 | |
191 | /* |
192 | * If that failed, try a different strategy (which we will also |
193 | * attempt in the total absence of wcstombs). Retrieve the |
194 | * locale's charset from nl_langinfo or equivalent, and use |
195 | * normal utoa_dup. |
196 | */ |
197 | return utoa_dup(s, charset_from_locale()); |
198 | } |
199 | |
200 | wchar_t *ufroma_locale_dup(char const *s) |
201 | { |
202 | /* |
203 | * This variant uses the C library locale. |
204 | */ |
205 | wchar_t *ret; |
206 | int len; |
207 | size_t siz; |
208 | |
209 | len = strlen(s); |
210 | |
211 | ret = mknewa(wchar_t, 1 + 2*len); /* be conservative */ |
212 | |
213 | siz = mbstowcs(ret, s, len); |
214 | |
215 | if (siz) { |
216 | assert(siz <= (size_t)(2 * len)); |
217 | ret[siz] = L'\0'; |
218 | ret = resize(ret, siz+1); |
219 | return ret; |
220 | } |
221 | |
222 | /* |
223 | * If that failed, try a different strategy (which we will also |
224 | * attempt in the total absence of wcstombs). Retrieve the |
225 | * locale's charset from nl_langinfo or equivalent, and use |
226 | * normal ufroma_dup. |
227 | */ |
228 | return ufroma_dup(s, charset_from_locale()); |
229 | } |
230 | |
5dd44dce |
/*
 * Return the number of wide characters in `s', not counting the
 * terminating zero. `s' must not be NULL.
 */
int ustrlen(wchar_t const *s) {
    wchar_t const *end = s;

    while (*end)
        end++;
    return (int)(end - s);
}
236 | |
/*
 * Return a pointer to the element just after the terminating zero
 * of the wide string `s'.
 */
wchar_t *uadv(wchar_t *s) {
    int n = ustrlen(s);

    return s + n + 1;
}
240 | |
5dd44dce |
/*
 * Copy the wide string `source', including its terminating zero,
 * to `dest'. Returns `dest'.
 */
wchar_t *ustrcpy(wchar_t *dest, wchar_t const *source) {
    wchar_t *out = dest;

    while ((*out++ = *source++) != 0)
        ;                              /* the copy happens in the test */
    return dest;
}
248 | |
/*
 * Compare two wide strings element by element. Returns -1, 0 or +1
 * according to whether `lhs' sorts before, equal to or after `rhs'.
 * NULL is accepted: two NULLs compare equal, and NULL sorts before
 * any non-NULL string.
 */
int ustrcmp(wchar_t *lhs, wchar_t *rhs) {
    if (!lhs || !rhs) {
        if (lhs)
            return +1;
        if (rhs)
            return -1;
        return 0;
    }

    /* Skip the common prefix. */
    for (; *lhs && *rhs && *lhs == *rhs; lhs++, rhs++)
        ;

    if (*lhs == *rhs)
        return 0;
    return (*lhs < *rhs) ? -1 : 1;
}
261 | |
/*
 * Map 'A'..'Z' to 'a'..'z' and return every other value unchanged.
 * In particular zero maps to zero, a property ustricmp relies on.
 */
wchar_t utolower(wchar_t c) {
    /* FIXME: this doesn't even come close */
    int is_upper = (c >= 'A' && c <= 'Z');

    if (c != L'\0' && is_upper)
        c += 'a'-'A';
    return c;
}
270 | |
831da32e |
/*
 * Return 1 if `c' lies in 'A'..'Z' or 'a'..'z', and 0 otherwise.
 */
int uisalpha(wchar_t c) {
    /* FIXME: this doesn't even come close */
    if (c >= 'A' && c <= 'Z')
        return 1;
    if (c >= 'a' && c <= 'z')
        return 1;
    return 0;
}
275 | |
d7482997 |
/*
 * Compare two wide strings after passing each element through
 * utolower(). Returns 0 if they match, -1 if `lhs' sorts first and
 * 1 otherwise. Neither argument may be NULL.
 */
int ustricmp(wchar_t *lhs, wchar_t *rhs) {
    wchar_t a, b;

    for (;;) {
        a = utolower(*lhs);
        b = utolower(*rhs);
        if (a != b || !a || !b)
            break;
        lhs++;
        rhs++;
    }

    if (!a && !b)
        return 0;
    return (a < b) ? -1 : 1;
}
287 | |
/*
 * Replace every element of the wide string `s' with its utolower()
 * image, in place. Returns `s'.
 */
wchar_t *ustrlow(wchar_t *s) {
    wchar_t *cursor;

    for (cursor = s; *cursor; cursor++)
        *cursor = utolower(*cursor);
    return s;
}
296 | |
/*
 * Parse a decimal integer from the start of the wide string `s':
 * an optional leading '-' followed by digits '0'..'9'. Parsing stops
 * at the first character that is not a digit; if there are no digits
 * at all the result is 0. No overflow checking is performed.
 */
int utoi(wchar_t *s) {
    int sign = +1;
    int n;

    if (*s == L'-') {
        s++;
        sign = -1;
    }

    n = 0;
    while (*s && *s >= L'0' && *s <= L'9') {
        n *= 10;
        n += (*s - '0');
        s++;
    }

    /* Apply the sign recorded above (it used to be silently dropped). */
    return sign * n;
}
315 | |
316 | int utob(wchar_t *s) { |
317 | if (!ustricmp(s, L"yes") || !ustricmp(s, L"y") || |
318 | !ustricmp(s, L"true") || !ustricmp(s, L"t")) |
319 | return TRUE; |
320 | return FALSE; |
321 | } |
322 | |
/*
 * Return 1 if `c' lies in '0'..'9', and 0 otherwise.
 */
int uisdigit(wchar_t c) {
    if (c < L'0' || c > L'9')
        return 0;
    return 1;
}
326 | |
327 | #define USTRFTIME_DELTA 128 |
328 | wchar_t *ustrftime(wchar_t *wfmt, struct tm *timespec) { |
329 | void *blk = NULL; |
330 | wchar_t *wblk, *wp; |
331 | char *fmt, *text, *p; |
332 | size_t size = 0; |
333 | size_t len; |
334 | |
335 | /* |
e4ea58f8 |
336 | * FIXME: really we ought to copy non-% parts of the format |
337 | * ourselves, and only resort to strftime for % parts. Also we |
338 | * should use wcsftime if it's present. |
339 | */ |
340 | |
341 | /* |
d7482997 |
342 | * strftime has the entertaining property that it returns 0 |
343 | * _either_ on out-of-space _or_ on successful generation of |
344 | * the empty string. Hence we must ensure our format can never |
345 | * generate the empty string. Somebody throw a custard pie at |
346 | * whoever was responsible for that. Please? |
347 | */ |
348 | if (wfmt) { |
349 | len = ustrlen(wfmt); |
350 | fmt = mknewa(char, 2+len); |
e4ea58f8 |
351 | ustrtoa(wfmt, fmt+1, len+1, CS_ASCII); /* CS_FIXME? */ |
d7482997 |
352 | fmt[0] = ' '; |
353 | } else |
354 | fmt = " %c"; |
355 | |
356 | while (1) { |
357 | size += USTRFTIME_DELTA; |
358 | blk = resize((char *)blk, size); |
359 | len = strftime((char *)blk, size-1, fmt, timespec); |
360 | if (len > 0) |
361 | break; |
362 | } |
363 | |
364 | /* Note: +1 for the terminating 0, -1 for the initial space in fmt */ |
365 | wblk = resize((wchar_t *)blk, len); |
366 | text = mknewa(char, len); |
367 | strftime(text, len, fmt+1, timespec); |
368 | /* |
369 | * We operate in the C locale, so this all ought to be kosher |
370 | * ASCII. If we ever move outside ASCII machines, we may need |
371 | * to make this more portable... |
372 | */ |
373 | for (wp = wblk, p = text; *p; p++, wp++) |
374 | *wp = *p; |
375 | *wp = 0; |
376 | if (wfmt) |
377 | sfree(fmt); |
378 | sfree(text); |
379 | return wblk; |
380 | } |
91f93b94 |
381 | |
382 | /* |
383 | * Determine whether a Unicode string can be translated into a |
384 | * given charset without any missing characters. |
385 | */ |
386 | int cvt_ok(int charset, const wchar_t *s) |
387 | { |
388 | char buf[256]; |
389 | charset_state state = CHARSET_INIT_STATE; |
390 | int err, len = ustrlen(s); |
391 | |
392 | err = 0; |
393 | while (len > 0) { |
394 | (void)charset_from_unicode(&s, &len, buf, lenof(buf), |
395 | charset, &state, &err); |
396 | if (err) |
397 | return FALSE; |
398 | } |
399 | return TRUE; |
400 | } |