Bug in utoi(), which made it ignore a leading minus sign when
[sgt/halibut] / ustring.c
1 /*
2 * ustring.c: Unicode string routines
3 */
4
5 #include <wchar.h>
6 #include <stdlib.h>
7 #include <assert.h>
8 #include <time.h>
9 #include "halibut.h"
10
11 wchar_t *ustrdup(wchar_t const *s) {
12 wchar_t *r;
13 if (s) {
14 r = snewn(1+ustrlen(s), wchar_t);
15 ustrcpy(r, s);
16 } else {
17 r = snew(wchar_t);
18 *r = 0;
19 }
20 return r;
21 }
22
23 static char *ustrtoa_internal(wchar_t const *s, char *outbuf, int size,
24 int charset, int careful) {
25 int len, ret, err;
26 charset_state state = CHARSET_INIT_STATE;
27
28 if (!s) {
29 *outbuf = '\0';
30 return outbuf;
31 }
32
33 len = ustrlen(s);
34 size--; /* leave room for terminating NUL */
35 *outbuf = '\0';
36 while (len > 0) {
37 err = 0;
38 ret = charset_from_unicode(&s, &len, outbuf, size, charset, &state,
39 (careful ? &err : NULL));
40 if (err)
41 return NULL;
42 if (!ret)
43 return outbuf;
44 size -= ret;
45 outbuf += ret;
46 *outbuf = '\0';
47 }
48 /*
49 * Clean up
50 */
51 ret = charset_from_unicode(NULL, 0, outbuf, size, charset, &state, NULL);
52 size -= ret;
53 outbuf += ret;
54 *outbuf = '\0';
55 return outbuf;
56 }
57
58 char *ustrtoa(wchar_t const *s, char *outbuf, int size, int charset) {
59 return ustrtoa_internal(s, outbuf, size, charset, FALSE);
60 }
61
62 char *ustrtoa_careful(wchar_t const *s, char *outbuf, int size, int charset) {
63 return ustrtoa_internal(s, outbuf, size, charset, TRUE);
64 }
65
66 wchar_t *ustrfroma(char const *s, wchar_t *outbuf, int size, int charset) {
67 int len, ret;
68 charset_state state = CHARSET_INIT_STATE;
69
70 if (!s) {
71 *outbuf = L'\0';
72 return outbuf;
73 }
74
75 len = strlen(s);
76 size--; /* allow for terminating NUL */
77 *outbuf = L'\0';
78 while (len > 0) {
79 ret = charset_to_unicode(&s, &len, outbuf, size,
80 charset, &state, NULL, 0);
81 if (!ret)
82 return outbuf;
83 outbuf += ret;
84 size -= ret;
85 *outbuf = L'\0';
86 }
87 return outbuf;
88 }
89
90 char *utoa_internal_dup(wchar_t const *s, int charset, int *lenp, int careful)
91 {
92 char *outbuf;
93 int outpos, outlen, len, ret, err;
94 charset_state state = CHARSET_INIT_STATE;
95
96 if (!s) {
97 return dupstr("");
98 }
99
100 len = ustrlen(s);
101
102 outlen = len + 10;
103 outbuf = snewn(outlen, char);
104
105 outpos = 0;
106 outbuf[outpos] = '\0';
107
108 while (len > 0) {
109 err = 0;
110 ret = charset_from_unicode(&s, &len,
111 outbuf + outpos, outlen - outpos - 1,
112 charset, &state, (careful ? &err : NULL));
113 if (err) {
114 sfree(outbuf);
115 return NULL;
116 }
117 if (!ret) {
118 outlen = outlen * 3 / 2;
119 outbuf = sresize(outbuf, outlen, char);
120 }
121 outpos += ret;
122 outbuf[outpos] = '\0';
123 }
124 /*
125 * Clean up
126 */
127 outlen = outpos + 32;
128 outbuf = sresize(outbuf, outlen, char);
129 ret = charset_from_unicode(NULL, 0,
130 outbuf + outpos, outlen - outpos + 1,
131 charset, &state, NULL);
132 outpos += ret;
133 outbuf[outpos] = '\0';
134 if (lenp)
135 *lenp = outpos;
136 return outbuf;
137 }
138
139 char *utoa_dup(wchar_t const *s, int charset)
140 {
141 return utoa_internal_dup(s, charset, NULL, FALSE);
142 }
143
144 char *utoa_dup_len(wchar_t const *s, int charset, int *len)
145 {
146 return utoa_internal_dup(s, charset, len, FALSE);
147 }
148
149 char *utoa_careful_dup(wchar_t const *s, int charset)
150 {
151 return utoa_internal_dup(s, charset, NULL, TRUE);
152 }
153
154 wchar_t *ufroma_dup(char const *s, int charset) {
155 int len;
156 wchar_t *buf = NULL;
157
158 len = strlen(s) + 1;
159 do {
160 buf = sresize(buf, len, wchar_t);
161 ustrfroma(s, buf, len, charset);
162 len = (3 * len) / 2 + 1; /* this guarantees a strict increase */
163 } while (ustrlen(buf) >= len-1);
164
165 buf = sresize(buf, ustrlen(buf)+1, wchar_t);
166 return buf;
167 }
168
169 char *utoa_locale_dup(wchar_t const *s)
170 {
171 /*
172 * This variant uses the C library locale.
173 */
174 char *ret;
175 int len, outlen;
176 size_t siz;
177
178 len = ustrlen(s);
179
180 outlen = 1 + MB_CUR_MAX * len;
181 ret = snewn(outlen+1, char);
182
183 siz = wcstombs(ret, s, outlen);
184
185 if (siz) {
186 assert(siz <= (size_t)(outlen));
187 ret[siz] = '\0';
188 ret = sresize(ret, siz+1, char);
189 return ret;
190 }
191
192 /*
193 * If that failed, try a different strategy (which we will also
194 * attempt in the total absence of wcstombs). Retrieve the
195 * locale's charset from nl_langinfo or equivalent, and use
196 * normal utoa_dup.
197 */
198 return utoa_dup(s, charset_from_locale());
199 }
200
201 wchar_t *ufroma_locale_dup(char const *s)
202 {
203 /*
204 * This variant uses the C library locale.
205 */
206 wchar_t *ret;
207 int len, outlen;
208 size_t siz;
209
210 len = strlen(s);
211
212 outlen = 1 + 2*len;
213 ret = snewn(outlen+1, wchar_t); /* be conservative */
214
215 siz = mbstowcs(ret, s, outlen);
216
217 if (siz) {
218 assert(siz <= (size_t)(outlen));
219 ret[siz] = L'\0';
220 ret = sresize(ret, siz+1, wchar_t);
221 return ret;
222 }
223
224 /*
225 * If that failed, try a different strategy (which we will also
226 * attempt in the total absence of wcstombs). Retrieve the
227 * locale's charset from nl_langinfo or equivalent, and use
228 * normal ufroma_dup.
229 */
230 return ufroma_dup(s, charset_from_locale());
231 }
232
/*
 * Count the wide characters in `s` before its terminating NUL.
 * `s` must not be NULL.
 */
int ustrlen(wchar_t const *s) {
    wchar_t const *p;
    int count = 0;

    for (p = s; *p != L'\0'; p++)
        count++;
    return count;
}
238
/*
 * Step over the NUL-terminated wide string at `s`, returning a
 * pointer to the element just past its terminating NUL.
 */
wchar_t *uadv(wchar_t *s) {
    int skip = 1 + ustrlen(s);     /* the characters plus the NUL */
    return s + skip;
}
242
/*
 * Copy the wide string `source`, including its terminating NUL,
 * to `dest`. No bounds are checked. Returns `dest`.
 */
wchar_t *ustrcpy(wchar_t *dest, wchar_t const *source) {
    wchar_t *out = dest;

    /* Copy each element, stopping once the NUL itself has been copied. */
    while ((*out++ = *source++) != L'\0')
        continue;
    return dest;
}
250
/*
 * Bounded wide-string copy. Returns `dest`.
 *
 * Note the count: the loop body runs once before `n` is first
 * tested, so n+1 elements of `dest` are written when n >= 0 (and
 * exactly one when n < 0). Once the end of `source` is reached
 * the remaining elements are filled with NULs. If `source` has
 * n+1 or more characters, the output is NOT NUL-terminated.
 */
wchar_t *ustrncpy(wchar_t *dest, wchar_t const *source, int n) {
    wchar_t *out = dest;
    int remaining = n;

    for (;;) {
        wchar_t ch = *source;

        *out++ = ch;
        if (ch != L'\0')
            source++;                  /* stay parked on the NUL once found */
        if (remaining-- <= 0)
            break;
    }
    return dest;
}
259
/*
 * Compare two wide strings element by element.
 *
 * Returns -1, 0 or +1 according to whether `lhs` sorts before,
 * equal to or after `rhs`. NULL pointers are accepted: two NULLs
 * compare equal, and a NULL sorts before any non-NULL string.
 */
int ustrcmp(wchar_t *lhs, wchar_t *rhs) {
    if (!lhs || !rhs) {
        if (lhs)
            return +1;
        if (rhs)
            return -1;
        return 0;
    }

    /* Skip the common prefix. */
    for (; *lhs && *rhs; lhs++, rhs++) {
        if (*lhs != *rhs)
            break;
    }

    if (*lhs == *rhs)
        return 0;
    return (*lhs < *rhs) ? -1 : 1;
}
272
/*
 * Map a wide character to lower case.
 *
 * When HAS_TOWLOWER is defined the work is done by towlower();
 * otherwise only the ASCII letters 'A'..'Z' are converted. In both
 * configurations NUL maps to NUL (ustricmp relies on this).
 */
wchar_t utolower(wchar_t c) {
    if (c == L'\0')
        return c;                      /* this property needed by ustricmp */
#ifdef HAS_TOWLOWER
    return towlower(c);
#else
    if (c < 'A' || c > 'Z')
        return c;                      /* not an ASCII capital: unchanged */
    c += 'a'-'A';
    return c;
#endif
}
284
/*
 * Test whether a wide character is alphabetic.
 *
 * When HAS_ISWALPHA is defined the answer comes from iswalpha();
 * otherwise only the ASCII letters count, and the result is
 * exactly 1 or 0.
 */
int uisalpha(wchar_t c) {
#ifdef HAS_ISWALPHA
    return iswalpha(c);
#else
    if (c >= 'A' && c <= 'Z')
        return 1;
    if (c >= 'a' && c <= 'z')
        return 1;
    return 0;
#endif
}
292
/*
 * Case-insensitive comparison of two wide strings: each pair of
 * characters is passed through utolower() before being compared.
 *
 * Returns 0 if the strings match all the way to their terminating
 * NULs, otherwise -1 or +1 according to the first pair of folded
 * characters that differ.
 */
int ustricmp(wchar_t const *lhs, wchar_t const *rhs) {
    for (;;) {
        wchar_t lc = utolower(*lhs);
        wchar_t rc = utolower(*rhs);

        if (lc != rc)
            return (lc < rc) ? -1 : 1;
        if (lc == L'\0')
            return 0;                  /* both strings ended together */
        lhs++;
        rhs++;
    }
}
304
/*
 * Case-insensitive comparison of at most `maxlen` leading
 * characters of two wide strings, folding case with utolower().
 *
 * Returns -1, 0 or +1. If `maxlen` is zero or negative nothing is
 * examined and the result is 0.
 */
int ustrnicmp(wchar_t const *lhs, wchar_t const *rhs, int maxlen) {
    wchar_t lc = 0, rc = 0;
    int left;

    for (left = maxlen; left > 0; left--) {
        lc = utolower(*lhs++);
        rc = utolower(*rhs++);
        /* Stop at the first difference or at a shared terminator. */
        if (lc != rc || lc == L'\0')
            break;
    }

    if (lc == rc)
        return 0;
    return (lc < rc) ? -1 : 1;
}
317
/*
 * Convert the wide string `s` to lower case in place, one
 * character at a time via utolower(). Returns `s`.
 */
wchar_t *ustrlow(wchar_t *s) {
    wchar_t *cursor;

    for (cursor = s; *cursor; cursor++)
        *cursor = utolower(*cursor);
    return s;
}
326
/*
 * Parse a decimal integer from the start of the wide string `s`.
 *
 * An optional leading '-' is honoured (a leading '+' is not).
 * Parsing stops at the first character that is not an ASCII digit;
 * if there are no digits at all the result is 0. No overflow
 * checking is performed.
 */
int utoi(wchar_t const *s) {
    int negative = (*s == L'-');
    int value = 0;

    if (negative)
        s++;

    for (; *s >= L'0' && *s <= L'9'; s++)
        value = value * 10 + (*s - '0');

    return negative ? -value : value;
}
345
346 double utof(wchar_t const *s)
347 {
348 char *cs = utoa_dup(s, CS_ASCII);
349 double ret = atof(cs);
350 sfree(cs);
351 return ret;
352 }
353
354 int utob(wchar_t const *s) {
355 if (!ustricmp(s, L"yes") || !ustricmp(s, L"y") ||
356 !ustricmp(s, L"true") || !ustricmp(s, L"t"))
357 return TRUE;
358 return FALSE;
359 }
360
/*
 * Test whether a wide character is one of the ASCII digits
 * '0'..'9'. The result is exactly 1 or 0.
 */
int uisdigit(wchar_t c) {
    return !(c < L'0' || c > L'9');
}
364
365 #define USTRFTIME_DELTA 128
366 static void ustrftime_internal(rdstring *rs, char formatchr,
367 const struct tm *timespec)
368 {
369 /*
370 * strftime has the entertaining property that it returns 0
371 * _either_ on out-of-space _or_ on successful generation of
372 * the empty string. Hence we must ensure our format can never
373 * generate the empty string. Somebody throw a custard pie at
374 * whoever was responsible for that. Please?
375 */
376
377 #ifdef HAS_WCSFTIME
378 wchar_t *buf = NULL;
379 wchar_t fmt[4];
380 int size, ret;
381
382 fmt[0] = L' ';
383 fmt[1] = L'%';
384 /* Format chars are all ASCII, so conversion to Unicode is no problem */
385 fmt[2] = formatchr;
386 fmt[3] = L'\0';
387
388 size = 0;
389 do {
390 size += USTRFTIME_DELTA;
391 buf = sresize(buf, size, wchar_t);
392 ret = (int) wcsftime(buf, size, fmt, timespec);
393 } while (ret == 0);
394
395 rdadds(rs, buf+1);
396 sfree(buf);
397 #else
398 char *buf = NULL;
399 wchar_t *cvtbuf;
400 char fmt[4];
401 int size, ret;
402
403 fmt[0] = ' ';
404 fmt[1] = '%';
405 fmt[2] = formatchr;
406 fmt[3] = '\0';
407
408 size = 0;
409 do {
410 size += USTRFTIME_DELTA;
411 buf = sresize(buf, size, char);
412 ret = (int) strftime(buf, size, fmt, timespec);
413 } while (ret == 0);
414
415 cvtbuf = ufroma_locale_dup(buf+1);
416 rdadds(rs, cvtbuf);
417 sfree(cvtbuf);
418 sfree(buf);
419 #endif
420 }
421
422 wchar_t *ustrftime(const wchar_t *wfmt, const struct tm *timespec)
423 {
424 rdstring rs = { 0, 0, NULL };
425
426 if (!wfmt)
427 wfmt = L"%c";
428
429 while (*wfmt) {
430 if (wfmt[0] == L'%' && wfmt[1] == L'%') {
431 rdadd(&rs, L'%');
432 wfmt += 2;
433 } else if (wfmt[0] == L'%' && wfmt[1]) {
434 ustrftime_internal(&rs, wfmt[1], timespec);
435 wfmt += 2;
436 } else {
437 rdadd(&rs, wfmt[0]);
438 wfmt++;
439 }
440 }
441
442 return rdtrim(&rs);
443 }
444
445 /*
446 * Determine whether a Unicode string can be translated into a
447 * given charset without any missing characters.
448 */
449 int cvt_ok(int charset, const wchar_t *s)
450 {
451 char buf[256];
452 charset_state state = CHARSET_INIT_STATE;
453 int err, len = ustrlen(s);
454
455 err = 0;
456 while (len > 0) {
457 (void)charset_from_unicode(&s, &len, buf, lenof(buf),
458 charset, &state, &err);
459 if (err)
460 return FALSE;
461 }
462 return TRUE;
463 }
464
465 /*
466 * Wrapper around charset_from_localenc which accepts the charset
467 * name as a wide string (since that happens to be more useful).
468 * Also throws a Halibut error and falls back to CS_ASCII if the
469 * charset is unrecognised, meaning the rest of the program can
470 * rely on always getting a valid charset id back from this
471 * function.
472 */
473 int charset_from_ustr(filepos *fpos, const wchar_t *name)
474 {
475 char *csname;
476 int charset;
477
478 csname = utoa_dup(name, CS_ASCII);
479 charset = charset_from_localenc(csname);
480
481 if (charset == CS_NONE) {
482 charset = CS_ASCII;
483 error(err_charset, fpos, name);
484 }
485
486 sfree(csname);
487 return charset;
488 }