Initial checkin of the shiny new rewritten-from-scratch HTML back
[sgt/halibut] / ustring.c
1 /*
2 * ustring.c: Unicode string routines
3 */
4
5 #include <wchar.h>
6 #include <stdlib.h>
7 #include <assert.h>
8 #include <time.h>
9 #include "halibut.h"
10
11 wchar_t *ustrdup(wchar_t const *s) {
12 wchar_t *r;
13 if (s) {
14 r = mknewa(wchar_t, 1+ustrlen(s));
15 ustrcpy(r, s);
16 } else {
17 r = mknew(wchar_t);
18 *r = 0;
19 }
20 return r;
21 }
22
23 static char *ustrtoa_internal(wchar_t const *s, char *outbuf, int size,
24 int charset, int careful) {
25 int len, ret, err;
26 charset_state state = CHARSET_INIT_STATE;
27
28 if (!s) {
29 *outbuf = '\0';
30 return outbuf;
31 }
32
33 len = ustrlen(s);
34 size--; /* leave room for terminating NUL */
35 *outbuf = '\0';
36 while (len > 0) {
37 err = 0;
38 ret = charset_from_unicode(&s, &len, outbuf, size, charset, &state,
39 (careful ? &err : NULL));
40 if (err)
41 return NULL;
42 if (!ret)
43 return outbuf;
44 size -= ret;
45 outbuf += ret;
46 *outbuf = '\0';
47 }
48 /*
49 * Clean up
50 */
51 ret = charset_from_unicode(NULL, 0, outbuf, size, charset, &state, NULL);
52 size -= ret;
53 outbuf += ret;
54 *outbuf = '\0';
55 return outbuf;
56 }
57
58 char *ustrtoa(wchar_t const *s, char *outbuf, int size, int charset) {
59 return ustrtoa_internal(s, outbuf, size, charset, FALSE);
60 }
61
62 char *ustrtoa_careful(wchar_t const *s, char *outbuf, int size, int charset) {
63 return ustrtoa_internal(s, outbuf, size, charset, TRUE);
64 }
65
66 wchar_t *ustrfroma(char const *s, wchar_t *outbuf, int size, int charset) {
67 int len, ret;
68 charset_state state = CHARSET_INIT_STATE;
69
70 if (!s) {
71 *outbuf = L'\0';
72 return outbuf;
73 }
74
75 len = strlen(s);
76 size--; /* allow for terminating NUL */
77 *outbuf = L'\0';
78 while (len > 0) {
79 ret = charset_to_unicode(&s, &len, outbuf, size,
80 charset, &state, NULL, 0);
81 if (!ret)
82 return outbuf;
83 outbuf += ret;
84 size -= ret;
85 *outbuf = L'\0';
86 }
87 return outbuf;
88 }
89
90 char *utoa_internal_dup(wchar_t const *s, int charset, int *lenp, int careful)
91 {
92 char *outbuf;
93 int outpos, outlen, len, ret, err;
94 charset_state state = CHARSET_INIT_STATE;
95
96 if (!s) {
97 return dupstr("");
98 }
99
100 len = ustrlen(s);
101
102 outlen = len + 10;
103 outbuf = mknewa(char, outlen);
104
105 outpos = 0;
106 outbuf[outpos] = '\0';
107
108 while (len > 0) {
109 err = 0;
110 ret = charset_from_unicode(&s, &len,
111 outbuf + outpos, outlen - outpos - 1,
112 charset, &state, (careful ? &err : NULL));
113 if (err) {
114 sfree(outbuf);
115 return NULL;
116 }
117 if (!ret) {
118 outlen = outlen * 3 / 2;
119 outbuf = resize(outbuf, outlen);
120 }
121 outpos += ret;
122 outbuf[outpos] = '\0';
123 }
124 /*
125 * Clean up
126 */
127 outlen = outpos + 32;
128 outbuf = resize(outbuf, outlen);
129 ret = charset_from_unicode(NULL, 0,
130 outbuf + outpos, outlen - outpos + 1,
131 charset, &state, NULL);
132 outpos += ret;
133 outbuf[outpos] = '\0';
134 if (lenp)
135 *lenp = outpos;
136 return outbuf;
137 }
138
139 char *utoa_dup(wchar_t const *s, int charset)
140 {
141 return utoa_internal_dup(s, charset, NULL, FALSE);
142 }
143
144 char *utoa_dup_len(wchar_t const *s, int charset, int *len)
145 {
146 return utoa_internal_dup(s, charset, len, FALSE);
147 }
148
149 char *utoa_careful_dup(wchar_t const *s, int charset)
150 {
151 return utoa_internal_dup(s, charset, NULL, TRUE);
152 }
153
/*
 * Convert the byte string `s', encoded in `charset', into a newly
 * allocated wide string which the caller must free. A NULL `s'
 * yields a newly allocated empty string.
 *
 * The conversion is attempted in progressively larger buffers until
 * the output does not fill the buffer completely.
 */
wchar_t *ufroma_dup(char const *s, int charset) {
    int len;
    wchar_t *buf = NULL;

    if (!s)
        return ustrdup(NULL);          /* allocated empty string */

    /*
     * ustrfroma stores at most len-1 characters plus the terminator,
     * so an output of exactly len-1 characters may have been
     * truncated. Start with one spare slot so that the common case
     * of one character per input byte is not mistaken for that.
     */
    len = strlen(s) + 2;
    for (;;) {
        buf = resize(buf, len);
        ustrfroma(s, buf, len, charset);
        /*
         * Test against the size used for THIS attempt. (Comparing
         * against the already-enlarged size could never be true, so
         * a truncated result was never retried.)
         */
        if (ustrlen(buf) < len-1)
            break;
        len = (3 * len) / 2 + 1;       /* this guarantees a strict increase */
    }

    /* Trim the allocation down to what is actually used. */
    buf = resize(buf, ustrlen(buf)+1);
    return buf;
}
168
169 char *utoa_locale_dup(wchar_t const *s)
170 {
171 /*
172 * This variant uses the C library locale.
173 */
174 char *ret;
175 int len;
176 size_t siz;
177
178 len = ustrlen(s);
179
180 ret = mknewa(char, 1 + MB_CUR_MAX * len);
181
182 siz = wcstombs(ret, s, len);
183
184 if (siz) {
185 assert(siz <= MB_CUR_MAX * len);
186 ret[siz] = '\0';
187 ret = resize(ret, siz+1);
188 return ret;
189 }
190
191 /*
192 * If that failed, try a different strategy (which we will also
193 * attempt in the total absence of wcstombs). Retrieve the
194 * locale's charset from nl_langinfo or equivalent, and use
195 * normal utoa_dup.
196 */
197 return utoa_dup(s, charset_from_locale());
198 }
199
200 wchar_t *ufroma_locale_dup(char const *s)
201 {
202 /*
203 * This variant uses the C library locale.
204 */
205 wchar_t *ret;
206 int len;
207 size_t siz;
208
209 len = strlen(s);
210
211 ret = mknewa(wchar_t, 1 + 2*len); /* be conservative */
212
213 siz = mbstowcs(ret, s, len);
214
215 if (siz) {
216 assert(siz <= (size_t)(2 * len));
217 ret[siz] = L'\0';
218 ret = resize(ret, siz+1);
219 return ret;
220 }
221
222 /*
223 * If that failed, try a different strategy (which we will also
224 * attempt in the total absence of wcstombs). Retrieve the
225 * locale's charset from nl_langinfo or equivalent, and use
226 * normal ufroma_dup.
227 */
228 return ufroma_dup(s, charset_from_locale());
229 }
230
/*
 * Return the number of wide characters in `s' before the
 * terminating zero. `s' must not be NULL.
 */
int ustrlen(wchar_t const *s) {
    wchar_t const *end;

    for (end = s; *end != 0; end++)
        continue;
    return (int)(end - s);
}
236
/*
 * Step past the string starting at `s': return a pointer to the
 * element immediately after its terminating zero.
 */
wchar_t *uadv(wchar_t *s) {
    int skip = ustrlen(s) + 1;         /* the string plus its terminator */
    return s + skip;
}
240
/*
 * Copy the wide string `source', including its terminator, to
 * `dest'. Returns `dest'.
 */
wchar_t *ustrcpy(wchar_t *dest, wchar_t const *source) {
    wchar_t *out = dest;

    for (;;) {
        *out++ = *source;
        /* Test the element just copied, then advance past it. */
        if (*source++ == 0)
            break;
    }
    return dest;
}
248
/*
 * Bounded copy of `source' to `dest'. Returns `dest'.
 *
 * NOTE: this stores n+1 elements, not n (and always at least one,
 * even for n <= 0). Once the end of `source' is reached the
 * remaining elements are filled with zeros. If `source' has more
 * than n characters, the output is NOT zero-terminated.
 */
wchar_t *ustrncpy(wchar_t *dest, wchar_t const *source, int n) {
    wchar_t *out = dest;
    int remaining = n;

    for (;;) {
        *out++ = *source;
        if (*source)
            source++;                  /* stay on the terminator once reached */
        if (remaining-- <= 0)
            break;
    }
    return dest;
}
257
/*
 * Compare two wide strings element by element.
 *
 * Returns -1, 0 or +1 according to whether `lhs' sorts before, equal
 * to or after `rhs'. NULL pointers are accepted: two NULLs compare
 * equal, and a NULL sorts before any non-NULL string.
 */
int ustrcmp(wchar_t *lhs, wchar_t *rhs) {
    if (!lhs || !rhs) {
        if (lhs)
            return +1;
        if (rhs)
            return -1;
        return 0;
    }

    /* Skip the common prefix; equality with a nonzero element
     * implies the other element is nonzero too. */
    for (; *lhs && *lhs == *rhs; lhs++, rhs++)
        continue;

    return (*lhs > *rhs) - (*lhs < *rhs);
}
270
/*
 * Lower-case a single wide character.
 *
 * Zero always maps to zero. With HAS_TOWLOWER defined the C
 * library's towlower does the work; otherwise only the ASCII
 * letters 'A'..'Z' are converted and everything else is returned
 * unchanged.
 */
wchar_t utolower(wchar_t c) {
    if (c == L'\0')
        return c;                      /* this property needed by ustricmp */
#ifdef HAS_TOWLOWER
    return towlower(c);
#else
    if (c < 'A' || c > 'Z')
        return c;
    return c + ('a'-'A');
#endif
}
282
/*
 * Test whether `c' is alphabetic. With HAS_ISWALPHA defined this
 * defers to the C library's iswalpha; otherwise only the ASCII
 * letters count.
 */
int uisalpha(wchar_t c) {
#ifdef HAS_ISWALPHA
    return iswalpha(c);
#else
    if (c >= 'a' && c <= 'z')
        return 1;
    return c >= 'A' && c <= 'Z';
#endif
}
290
/*
 * Case-insensitive comparison of two wide strings, folding each
 * element with utolower before comparing. Returns -1, 0 or +1.
 * Neither argument may be NULL.
 */
int ustricmp(wchar_t const *lhs, wchar_t const *rhs) {
    wchar_t a, b;

    for (;; lhs++, rhs++) {
        a = utolower(*lhs);
        b = utolower(*rhs);
        if (a != b)
            return a < b ? -1 : 1;
        if (a == 0)
            return 0;                  /* both strings ended together */
    }
}
302
/*
 * Case-insensitive comparison of at most `maxlen' leading elements
 * of two wide strings, folding each element with utolower.
 * Returns -1, 0 or +1; strings that agree over the first `maxlen'
 * elements (or any strings when maxlen <= 0) compare equal.
 */
int ustrnicmp(wchar_t const *lhs, wchar_t const *rhs, int maxlen) {
    wchar_t a = 0, b = 0;

    while (maxlen-- > 0) {
        a = utolower(*lhs);
        b = utolower(*rhs);
        /* Stop on the first difference, or when both strings end. */
        if (a != b || a == 0)
            break;
        lhs++;
        rhs++;
    }

    if (a < b)
        return -1;
    if (a > b)
        return 1;
    return 0;
}
315
/*
 * Lower-case the wide string `s' in place using utolower.
 * Returns `s'.
 */
wchar_t *ustrlow(wchar_t *s) {
    wchar_t *cursor;

    for (cursor = s; *cursor; cursor++)
        *cursor = utolower(*cursor);
    return s;
}
324
/*
 * Parse a decimal integer from the start of the wide string `s'.
 *
 * An optional leading '-' is accepted, followed by ASCII digits;
 * parsing stops at the first non-digit. A string with no digits
 * yields 0. No overflow checking is performed.
 */
int utoi(wchar_t const *s) {
    int sign = +1;
    int n = 0;

    if (*s == L'-') {
        s++;
        sign = -1;
    }

    while (*s >= L'0' && *s <= L'9') {
        n = n * 10 + (int)(*s - L'0');
        s++;
    }

    /* Apply the sign: it used to be parsed and then discarded, so
     * negative numbers came back positive. */
    return sign * n;
}
343
344 double utof(wchar_t const *s)
345 {
346 char *cs = utoa_dup(s, CS_ASCII);
347 double ret = atof(cs);
348 sfree(cs);
349 return ret;
350 }
351
352 int utob(wchar_t const *s) {
353 if (!ustricmp(s, L"yes") || !ustricmp(s, L"y") ||
354 !ustricmp(s, L"true") || !ustricmp(s, L"t"))
355 return TRUE;
356 return FALSE;
357 }
358
/*
 * Test whether `c' is one of the ASCII decimal digits '0'..'9'.
 */
int uisdigit(wchar_t c) {
    if (c < L'0')
        return 0;
    return c <= L'9';
}
362
363 #define USTRFTIME_DELTA 128
364 static void ustrftime_internal(rdstring *rs, char formatchr,
365 const struct tm *timespec)
366 {
367 /*
368 * strftime has the entertaining property that it returns 0
369 * _either_ on out-of-space _or_ on successful generation of
370 * the empty string. Hence we must ensure our format can never
371 * generate the empty string. Somebody throw a custard pie at
372 * whoever was responsible for that. Please?
373 */
374
375 #ifdef HAS_WCSFTIME
376 wchar_t *buf = NULL;
377 wchar_t fmt[4];
378 int size, ret;
379
380 fmt[0] = L' ';
381 fmt[1] = L'%';
382 /* Format chars are all ASCII, so conversion to Unicode is no problem */
383 fmt[2] = formatchr;
384 fmt[3] = L'\0';
385
386 size = 0;
387 do {
388 size += USTRFTIME_DELTA;
389 buf = resize(buf, size);
390 ret = (int) wcsftime(buf, size, fmt, timespec);
391 } while (ret == 0);
392
393 rdadds(rs, buf+1);
394 sfree(buf);
395 #else
396 char *buf = NULL;
397 wchar_t *cvtbuf;
398 char fmt[4];
399 int size, ret;
400
401 fmt[0] = ' ';
402 fmt[1] = '%';
403 fmt[2] = formatchr;
404 fmt[3] = '\0';
405
406 size = 0;
407 do {
408 size += USTRFTIME_DELTA;
409 buf = resize(buf, size);
410 ret = (int) strftime(buf, size, fmt, timespec);
411 } while (ret == 0);
412
413 cvtbuf = ufroma_locale_dup(buf+1);
414 rdadds(rs, cvtbuf);
415 sfree(cvtbuf);
416 sfree(buf);
417 #endif
418 }
419
420 wchar_t *ustrftime(const wchar_t *wfmt, const struct tm *timespec)
421 {
422 rdstring rs = { 0, 0, NULL };
423
424 if (!wfmt)
425 wfmt = L"%c";
426
427 while (*wfmt) {
428 if (wfmt[0] == L'%' && wfmt[1] == L'%') {
429 rdadd(&rs, L'%');
430 wfmt += 2;
431 } else if (wfmt[0] == L'%' && wfmt[1]) {
432 ustrftime_internal(&rs, wfmt[1], timespec);
433 wfmt += 2;
434 } else {
435 rdadd(&rs, wfmt[0]);
436 wfmt++;
437 }
438 }
439
440 return rdtrim(&rs);
441 }
442
443 /*
444 * Determine whether a Unicode string can be translated into a
445 * given charset without any missing characters.
446 */
447 int cvt_ok(int charset, const wchar_t *s)
448 {
449 char buf[256];
450 charset_state state = CHARSET_INIT_STATE;
451 int err, len = ustrlen(s);
452
453 err = 0;
454 while (len > 0) {
455 (void)charset_from_unicode(&s, &len, buf, lenof(buf),
456 charset, &state, &err);
457 if (err)
458 return FALSE;
459 }
460 return TRUE;
461 }