Add an error check for correct formatting in Deflate uncompressed
[sgt/halibut] / ustring.c
CommitLineData
d7482997 1/*
2 * ustring.c: Unicode string routines
3 */
4
5#include <wchar.h>
7e976207 6#include <stdlib.h>
7#include <assert.h>
d7482997 8#include <time.h>
9#include "halibut.h"
10
e4ea58f8 11wchar_t *ustrdup(wchar_t const *s) {
d7482997 12 wchar_t *r;
13 if (s) {
f1530049 14 r = snewn(1+ustrlen(s), wchar_t);
d7482997 15 ustrcpy(r, s);
16 } else {
f1530049 17 r = snew(wchar_t);
d7482997 18 *r = 0;
19 }
20 return r;
21}
22
e4ea58f8 23static char *ustrtoa_internal(wchar_t const *s, char *outbuf, int size,
24 int charset, int careful) {
25 int len, ret, err;
26 charset_state state = CHARSET_INIT_STATE;
27
d7482997 28 if (!s) {
29 *outbuf = '\0';
30 return outbuf;
31 }
e4ea58f8 32
33 len = ustrlen(s);
34 size--; /* leave room for terminating NUL */
35 *outbuf = '\0';
36 while (len > 0) {
37 err = 0;
38 ret = charset_from_unicode(&s, &len, outbuf, size, charset, &state,
39 (careful ? &err : NULL));
40 if (err)
41 return NULL;
42 if (!ret)
43 return outbuf;
44 size -= ret;
45 outbuf += ret;
46 *outbuf = '\0';
47 }
48 /*
49 * Clean up
50 */
51 ret = charset_from_unicode(NULL, 0, outbuf, size, charset, &state, NULL);
52 size -= ret;
53 outbuf += ret;
54 *outbuf = '\0';
d7482997 55 return outbuf;
56}
57
e4ea58f8 58char *ustrtoa(wchar_t const *s, char *outbuf, int size, int charset) {
59 return ustrtoa_internal(s, outbuf, size, charset, FALSE);
60}
61
62char *ustrtoa_careful(wchar_t const *s, char *outbuf, int size, int charset) {
63 return ustrtoa_internal(s, outbuf, size, charset, TRUE);
64}
65
66wchar_t *ustrfroma(char const *s, wchar_t *outbuf, int size, int charset) {
67 int len, ret;
68 charset_state state = CHARSET_INIT_STATE;
69
ba9c1487 70 if (!s) {
71 *outbuf = L'\0';
72 return outbuf;
73 }
e4ea58f8 74
75 len = strlen(s);
76 size--; /* allow for terminating NUL */
77 *outbuf = L'\0';
78 while (len > 0) {
79 ret = charset_to_unicode(&s, &len, outbuf, size,
80 charset, &state, NULL, 0);
81 if (!ret)
82 return outbuf;
83 outbuf += ret;
84 size -= ret;
85 *outbuf = L'\0';
86 }
ba9c1487 87 return outbuf;
88}
89
e4ea58f8 90char *utoa_internal_dup(wchar_t const *s, int charset, int *lenp, int careful)
91{
92 char *outbuf;
93 int outpos, outlen, len, ret, err;
94 charset_state state = CHARSET_INIT_STATE;
50d6b4bd 95
e4ea58f8 96 if (!s) {
97 return dupstr("");
98 }
50d6b4bd 99
e4ea58f8 100 len = ustrlen(s);
101
102 outlen = len + 10;
f1530049 103 outbuf = snewn(outlen, char);
e4ea58f8 104
105 outpos = 0;
106 outbuf[outpos] = '\0';
107
108 while (len > 0) {
109 err = 0;
110 ret = charset_from_unicode(&s, &len,
111 outbuf + outpos, outlen - outpos - 1,
112 charset, &state, (careful ? &err : NULL));
113 if (err) {
114 sfree(outbuf);
115 return NULL;
116 }
117 if (!ret) {
118 outlen = outlen * 3 / 2;
f1530049 119 outbuf = sresize(outbuf, outlen, char);
e4ea58f8 120 }
121 outpos += ret;
122 outbuf[outpos] = '\0';
123 }
124 /*
125 * Clean up
126 */
127 outlen = outpos + 32;
f1530049 128 outbuf = sresize(outbuf, outlen, char);
e4ea58f8 129 ret = charset_from_unicode(NULL, 0,
130 outbuf + outpos, outlen - outpos + 1,
131 charset, &state, NULL);
132 outpos += ret;
133 outbuf[outpos] = '\0';
134 if (lenp)
135 *lenp = outpos;
136 return outbuf;
50d6b4bd 137}
138
e4ea58f8 139char *utoa_dup(wchar_t const *s, int charset)
140{
141 return utoa_internal_dup(s, charset, NULL, FALSE);
142}
143
144char *utoa_dup_len(wchar_t const *s, int charset, int *len)
145{
146 return utoa_internal_dup(s, charset, len, FALSE);
147}
148
149char *utoa_careful_dup(wchar_t const *s, int charset)
150{
151 return utoa_internal_dup(s, charset, NULL, TRUE);
152}
153
154wchar_t *ufroma_dup(char const *s, int charset) {
ba9c1487 155 int len;
156 wchar_t *buf = NULL;
157
158 len = strlen(s) + 1;
159 do {
f1530049 160 buf = sresize(buf, len, wchar_t);
e4ea58f8 161 ustrfroma(s, buf, len, charset);
ba9c1487 162 len = (3 * len) / 2 + 1; /* this guarantees a strict increase */
163 } while (ustrlen(buf) >= len-1);
164
f1530049 165 buf = sresize(buf, ustrlen(buf)+1, wchar_t);
ba9c1487 166 return buf;
167}
168
7e976207 169char *utoa_locale_dup(wchar_t const *s)
170{
171 /*
172 * This variant uses the C library locale.
173 */
174 char *ret;
8281de1b 175 int len, outlen;
7e976207 176 size_t siz;
177
178 len = ustrlen(s);
179
8281de1b 180 outlen = 1 + MB_CUR_MAX * len;
181 ret = snewn(outlen+1, char);
7e976207 182
8281de1b 183 siz = wcstombs(ret, s, outlen);
7e976207 184
185 if (siz) {
8281de1b 186 assert(siz <= (size_t)(outlen));
7e976207 187 ret[siz] = '\0';
f1530049 188 ret = sresize(ret, siz+1, char);
7e976207 189 return ret;
190 }
191
192 /*
193 * If that failed, try a different strategy (which we will also
194 * attempt in the total absence of wcstombs). Retrieve the
195 * locale's charset from nl_langinfo or equivalent, and use
196 * normal utoa_dup.
197 */
198 return utoa_dup(s, charset_from_locale());
199}
200
201wchar_t *ufroma_locale_dup(char const *s)
202{
203 /*
204 * This variant uses the C library locale.
205 */
206 wchar_t *ret;
8281de1b 207 int len, outlen;
7e976207 208 size_t siz;
209
210 len = strlen(s);
211
8281de1b 212 outlen = 1 + 2*len;
213 ret = snewn(outlen+1, wchar_t); /* be conservative */
7e976207 214
8281de1b 215 siz = mbstowcs(ret, s, outlen);
7e976207 216
217 if (siz) {
8281de1b 218 assert(siz <= (size_t)(outlen));
7e976207 219 ret[siz] = L'\0';
f1530049 220 ret = sresize(ret, siz+1, wchar_t);
7e976207 221 return ret;
222 }
223
224 /*
225 * If that failed, try a different strategy (which we will also
226 * attempt in the total absence of wcstombs). Retrieve the
227 * locale's charset from nl_langinfo or equivalent, and use
228 * normal ufroma_dup.
229 */
230 return ufroma_dup(s, charset_from_locale());
231}
232
/*
 * Return the number of wide characters in `s' before its
 * terminating NUL.
 */
int ustrlen(wchar_t const *s) {
    wchar_t const *end = s;

    while (*end != L'\0')
        end++;
    return (int)(end - s);
}
238
/*
 * Step past the NUL-terminated wide string at `s', returning a
 * pointer to the element immediately following its terminator.
 */
wchar_t *uadv(wchar_t *s) {
    wchar_t *p = s;

    while (*p != L'\0')
        p++;
    return p + 1;
}
242
/*
 * Copy the wide string `source', including its terminating NUL,
 * into `dest'. Returns `dest'.
 */
wchar_t *ustrcpy(wchar_t *dest, wchar_t const *source) {
    wchar_t *out = dest;

    while ((*out++ = *source++) != L'\0')
        continue;
    return dest;
}
250
/*
 * Bounded wide-string copy from `source' into `dest'. Once the end
 * of `source' is reached, further output positions are filled with
 * NUL. Returns `dest'.
 *
 * NOTE(review): the loop stores n+1 elements (and one element even
 * when n <= 0), not n as strncpy/wcsncpy would. Callers may depend
 * on this, so it is preserved here; confirm against the call sites
 * before changing it.
 */
wchar_t *ustrncpy(wchar_t *dest, wchar_t const *source, int n) {
    wchar_t *out = dest;
    int remaining = n;

    for (;;) {
        *out++ = *source;
        if (*source != L'\0')
            source++;                  /* stay parked on the NUL once found */
        if (remaining-- <= 0)
            break;
    }
    return dest;
}
259
/*
 * Compare two wide strings element by element. Returns -1, 0 or +1.
 * NULL is accepted for either argument: two NULLs compare equal, and
 * a NULL sorts before any non-NULL string.
 */
int ustrcmp(wchar_t *lhs, wchar_t *rhs) {
    if (!lhs || !rhs) {
        if (lhs)
            return +1;                 /* only rhs is NULL */
        if (rhs)
            return -1;                 /* only lhs is NULL */
        return 0;
    }

    /* Advance while the strings agree and have not ended. */
    for (; *lhs && *lhs == *rhs; lhs++, rhs++)
        continue;

    return (*lhs > *rhs) - (*lhs < *rhs);
}
272
/*
 * Lower-case a single wide character. NUL always maps to NUL, a
 * property ustricmp depends on. When HAS_TOWLOWER is defined the
 * work is delegated to towlower(); otherwise only 'A'..'Z' are
 * folded.
 */
wchar_t utolower(wchar_t c) {
#ifdef HAS_TOWLOWER
    return (c == L'\0') ? c : towlower(c);
#else
    int const is_upper = (c >= 'A' && c <= 'Z');

    if (is_upper)
        c += 'a'-'A';
    return c;
#endif
}
284
/*
 * Report whether `c' is alphabetic. When HAS_ISWALPHA is defined the
 * answer comes from iswalpha(); otherwise only 'A'..'Z' and
 * 'a'..'z' qualify.
 */
int uisalpha(wchar_t c) {
#ifdef HAS_ISWALPHA
    return iswalpha(c);
#else
    int const is_upper = (c >= 'A' && c <= 'Z');
    int const is_lower = (c >= 'a' && c <= 'z');

    return is_upper || is_lower;
#endif
}
292
/*
 * Compare two wide strings after folding each character through
 * utolower. Returns 0 if they match, -1 if `lhs' sorts first and
 * 1 otherwise. Relies on utolower mapping NUL to NUL.
 */
int ustricmp(wchar_t const *lhs, wchar_t const *rhs) {
    wchar_t lc, rc;

    for (;;) {
        lc = utolower(*lhs);
        rc = utolower(*rhs);
        if (lc != rc || !lc)
            break;                     /* mismatch, or both strings ended */
        lhs++, rhs++;
    }

    if (!lc && !rc)
        return 0;
    return (lc < rc) ? -1 : 1;
}
304
/*
 * As ustricmp, but examines at most `maxlen' characters of each
 * string. Returns -1, 0 or 1; a `maxlen' of zero or less compares
 * equal.
 */
int ustrnicmp(wchar_t const *lhs, wchar_t const *rhs, int maxlen) {
    wchar_t lc = 0, rc = 0;
    int i;

    for (i = 0; i < maxlen; i++) {
        lc = utolower(lhs[i]);
        rc = utolower(rhs[i]);
        if (lc != rc || !lc)
            break;                     /* mismatch, or both strings ended */
    }

    /* lc and rc hold the last pair examined (or 0,0 if none were). */
    return (lc > rc) - (lc < rc);
}
317
/*
 * Lower-case the wide string `s' in place, one character at a time
 * via utolower. Returns `s'.
 */
wchar_t *ustrlow(wchar_t *s) {
    wchar_t *p;

    for (p = s; *p; p++)
        *p = utolower(*p);
    return s;
}
326
/*
 * Parse a decimal integer from the start of the wide string `s': an
 * optional leading '-' followed by ASCII digits. Parsing stops at
 * the first non-digit; if there are no digits the result is 0. No
 * overflow checking is performed.
 */
int utoi(wchar_t const *s) {
    int const negative = (*s == L'-');
    int value = 0;

    if (negative)
        s++;

    for (; *s >= L'0' && *s <= L'9'; s++) {
        value *= 10;
        value += (*s - '0');
    }

    return negative ? value * -1 : value;
}
345
dd567011 346double utof(wchar_t const *s)
347{
348 char *cs = utoa_dup(s, CS_ASCII);
349 double ret = atof(cs);
350 sfree(cs);
351 return ret;
352}
353
354int utob(wchar_t const *s) {
d7482997 355 if (!ustricmp(s, L"yes") || !ustricmp(s, L"y") ||
356 !ustricmp(s, L"true") || !ustricmp(s, L"t"))
357 return TRUE;
358 return FALSE;
359}
360
/*
 * Report whether `c' is one of the digits '0'..'9'. Returns 1 if
 * so, 0 otherwise.
 */
int uisdigit(wchar_t c) {
    return !(c < L'0' || c > L'9');
}
364
365#define USTRFTIME_DELTA 128
c8422236 366static void ustrftime_internal(rdstring *rs, char formatchr,
367 const struct tm *timespec)
368{
e4ea58f8 369 /*
d7482997 370 * strftime has the entertaining property that it returns 0
371 * _either_ on out-of-space _or_ on successful generation of
372 * the empty string. Hence we must ensure our format can never
373 * generate the empty string. Somebody throw a custard pie at
374 * whoever was responsible for that. Please?
375 */
c8422236 376
377#ifdef HAS_WCSFTIME
378 wchar_t *buf = NULL;
379 wchar_t fmt[4];
380 int size, ret;
381
382 fmt[0] = L' ';
383 fmt[1] = L'%';
384 /* Format chars are all ASCII, so conversion to Unicode is no problem */
385 fmt[2] = formatchr;
386 fmt[3] = L'\0';
387
388 size = 0;
389 do {
d7482997 390 size += USTRFTIME_DELTA;
f1530049 391 buf = sresize(buf, size, wchar_t);
c8422236 392 ret = (int) wcsftime(buf, size, fmt, timespec);
393 } while (ret == 0);
394
395 rdadds(rs, buf+1);
396 sfree(buf);
397#else
398 char *buf = NULL;
399 wchar_t *cvtbuf;
400 char fmt[4];
401 int size, ret;
402
403 fmt[0] = ' ';
404 fmt[1] = '%';
405 fmt[2] = formatchr;
406 fmt[3] = '\0';
407
408 size = 0;
409 do {
410 size += USTRFTIME_DELTA;
f1530049 411 buf = sresize(buf, size, char);
c8422236 412 ret = (int) strftime(buf, size, fmt, timespec);
413 } while (ret == 0);
414
415 cvtbuf = ufroma_locale_dup(buf+1);
416 rdadds(rs, cvtbuf);
417 sfree(cvtbuf);
418 sfree(buf);
419#endif
420}
421
422wchar_t *ustrftime(const wchar_t *wfmt, const struct tm *timespec)
423{
424 rdstring rs = { 0, 0, NULL };
425
426 if (!wfmt)
427 wfmt = L"%c";
428
429 while (*wfmt) {
430 if (wfmt[0] == L'%' && wfmt[1] == L'%') {
431 rdadd(&rs, L'%');
432 wfmt += 2;
433 } else if (wfmt[0] == L'%' && wfmt[1]) {
434 ustrftime_internal(&rs, wfmt[1], timespec);
435 wfmt += 2;
436 } else {
437 rdadd(&rs, wfmt[0]);
438 wfmt++;
439 }
d7482997 440 }
441
c8422236 442 return rdtrim(&rs);
d7482997 443}
91f93b94 444
445/*
446 * Determine whether a Unicode string can be translated into a
447 * given charset without any missing characters.
448 */
449int cvt_ok(int charset, const wchar_t *s)
450{
451 char buf[256];
452 charset_state state = CHARSET_INIT_STATE;
453 int err, len = ustrlen(s);
454
455 err = 0;
456 while (len > 0) {
457 (void)charset_from_unicode(&s, &len, buf, lenof(buf),
458 charset, &state, &err);
459 if (err)
460 return FALSE;
461 }
462 return TRUE;
463}
0960a3d8 464
465/*
466 * Wrapper around charset_from_localenc which accepts the charset
467 * name as a wide string (since that happens to be more useful).
468 * Also throws a Halibut error and falls back to CS_ASCII if the
469 * charset is unrecognised, meaning the rest of the program can
470 * rely on always getting a valid charset id back from this
471 * function.
472 */
473int charset_from_ustr(filepos *fpos, const wchar_t *name)
474{
475 char *csname;
476 int charset;
477
478 csname = utoa_dup(name, CS_ASCII);
479 charset = charset_from_localenc(csname);
480
481 if (charset == CS_NONE) {
482 charset = CS_ASCII;
483 error(err_charset, fpos, name);
484 }
485
486 sfree(csname);
487 return charset;
488}