Use wcscoll(), if available, when sorting index terms. (In a
[sgt/halibut] / ustring.c
CommitLineData
d7482997 1/*
2 * ustring.c: Unicode string routines
3 */
4
5#include <wchar.h>
7e976207 6#include <stdlib.h>
7#include <assert.h>
d7482997 8#include <time.h>
9#include "halibut.h"
10
e4ea58f8 11wchar_t *ustrdup(wchar_t const *s) {
d7482997 12 wchar_t *r;
13 if (s) {
14 r = mknewa(wchar_t, 1+ustrlen(s));
15 ustrcpy(r, s);
16 } else {
17 r = mknew(wchar_t);
18 *r = 0;
19 }
20 return r;
21}
22
e4ea58f8 23static char *ustrtoa_internal(wchar_t const *s, char *outbuf, int size,
24 int charset, int careful) {
25 int len, ret, err;
26 charset_state state = CHARSET_INIT_STATE;
27
d7482997 28 if (!s) {
29 *outbuf = '\0';
30 return outbuf;
31 }
e4ea58f8 32
33 len = ustrlen(s);
34 size--; /* leave room for terminating NUL */
35 *outbuf = '\0';
36 while (len > 0) {
37 err = 0;
38 ret = charset_from_unicode(&s, &len, outbuf, size, charset, &state,
39 (careful ? &err : NULL));
40 if (err)
41 return NULL;
42 if (!ret)
43 return outbuf;
44 size -= ret;
45 outbuf += ret;
46 *outbuf = '\0';
47 }
48 /*
49 * Clean up
50 */
51 ret = charset_from_unicode(NULL, 0, outbuf, size, charset, &state, NULL);
52 size -= ret;
53 outbuf += ret;
54 *outbuf = '\0';
d7482997 55 return outbuf;
56}
57
e4ea58f8 58char *ustrtoa(wchar_t const *s, char *outbuf, int size, int charset) {
59 return ustrtoa_internal(s, outbuf, size, charset, FALSE);
60}
61
62char *ustrtoa_careful(wchar_t const *s, char *outbuf, int size, int charset) {
63 return ustrtoa_internal(s, outbuf, size, charset, TRUE);
64}
65
66wchar_t *ustrfroma(char const *s, wchar_t *outbuf, int size, int charset) {
67 int len, ret;
68 charset_state state = CHARSET_INIT_STATE;
69
ba9c1487 70 if (!s) {
71 *outbuf = L'\0';
72 return outbuf;
73 }
e4ea58f8 74
75 len = strlen(s);
76 size--; /* allow for terminating NUL */
77 *outbuf = L'\0';
78 while (len > 0) {
79 ret = charset_to_unicode(&s, &len, outbuf, size,
80 charset, &state, NULL, 0);
81 if (!ret)
82 return outbuf;
83 outbuf += ret;
84 size -= ret;
85 *outbuf = L'\0';
86 }
ba9c1487 87 return outbuf;
88}
89
e4ea58f8 90char *utoa_internal_dup(wchar_t const *s, int charset, int *lenp, int careful)
91{
92 char *outbuf;
93 int outpos, outlen, len, ret, err;
94 charset_state state = CHARSET_INIT_STATE;
50d6b4bd 95
e4ea58f8 96 if (!s) {
97 return dupstr("");
98 }
50d6b4bd 99
e4ea58f8 100 len = ustrlen(s);
101
102 outlen = len + 10;
103 outbuf = mknewa(char, outlen);
104
105 outpos = 0;
106 outbuf[outpos] = '\0';
107
108 while (len > 0) {
109 err = 0;
110 ret = charset_from_unicode(&s, &len,
111 outbuf + outpos, outlen - outpos - 1,
112 charset, &state, (careful ? &err : NULL));
113 if (err) {
114 sfree(outbuf);
115 return NULL;
116 }
117 if (!ret) {
118 outlen = outlen * 3 / 2;
119 outbuf = resize(outbuf, outlen);
120 }
121 outpos += ret;
122 outbuf[outpos] = '\0';
123 }
124 /*
125 * Clean up
126 */
127 outlen = outpos + 32;
128 outbuf = resize(outbuf, outlen);
129 ret = charset_from_unicode(NULL, 0,
130 outbuf + outpos, outlen - outpos + 1,
131 charset, &state, NULL);
132 outpos += ret;
133 outbuf[outpos] = '\0';
134 if (lenp)
135 *lenp = outpos;
136 return outbuf;
50d6b4bd 137}
138
e4ea58f8 139char *utoa_dup(wchar_t const *s, int charset)
140{
141 return utoa_internal_dup(s, charset, NULL, FALSE);
142}
143
144char *utoa_dup_len(wchar_t const *s, int charset, int *len)
145{
146 return utoa_internal_dup(s, charset, len, FALSE);
147}
148
149char *utoa_careful_dup(wchar_t const *s, int charset)
150{
151 return utoa_internal_dup(s, charset, NULL, TRUE);
152}
153
154wchar_t *ufroma_dup(char const *s, int charset) {
ba9c1487 155 int len;
156 wchar_t *buf = NULL;
157
158 len = strlen(s) + 1;
159 do {
160 buf = resize(buf, len);
e4ea58f8 161 ustrfroma(s, buf, len, charset);
ba9c1487 162 len = (3 * len) / 2 + 1; /* this guarantees a strict increase */
163 } while (ustrlen(buf) >= len-1);
164
165 buf = resize(buf, ustrlen(buf)+1);
166 return buf;
167}
168
7e976207 169char *utoa_locale_dup(wchar_t const *s)
170{
171 /*
172 * This variant uses the C library locale.
173 */
174 char *ret;
175 int len;
176 size_t siz;
177
178 len = ustrlen(s);
179
180 ret = mknewa(char, 1 + MB_CUR_MAX * len);
181
182 siz = wcstombs(ret, s, len);
183
184 if (siz) {
185 assert(siz <= MB_CUR_MAX * len);
186 ret[siz] = '\0';
187 ret = resize(ret, siz+1);
188 return ret;
189 }
190
191 /*
192 * If that failed, try a different strategy (which we will also
193 * attempt in the total absence of wcstombs). Retrieve the
194 * locale's charset from nl_langinfo or equivalent, and use
195 * normal utoa_dup.
196 */
197 return utoa_dup(s, charset_from_locale());
198}
199
200wchar_t *ufroma_locale_dup(char const *s)
201{
202 /*
203 * This variant uses the C library locale.
204 */
205 wchar_t *ret;
206 int len;
207 size_t siz;
208
209 len = strlen(s);
210
211 ret = mknewa(wchar_t, 1 + 2*len); /* be conservative */
212
213 siz = mbstowcs(ret, s, len);
214
215 if (siz) {
216 assert(siz <= (size_t)(2 * len));
217 ret[siz] = L'\0';
218 ret = resize(ret, siz+1);
219 return ret;
220 }
221
222 /*
223 * If that failed, try a different strategy (which we will also
224 * attempt in the total absence of wcstombs). Retrieve the
225 * locale's charset from nl_langinfo or equivalent, and use
226 * normal ufroma_dup.
227 */
228 return ufroma_dup(s, charset_from_locale());
229}
230
/*
 * Return the number of wide characters in `s' before its
 * terminating zero. `s' must not be NULL.
 */
int ustrlen(wchar_t const *s) {
    int count;

    for (count = 0; s[count] != 0; count++)
        continue;
    return count;
}
236
/*
 * Step past the string starting at `s' and its terminator,
 * returning a pointer to the element immediately after the zero.
 */
wchar_t *uadv(wchar_t *s) {
    int skip = 1 + ustrlen(s);
    return s + skip;
}
240
/*
 * Copy the wide string `source', terminator included, to `dest'
 * and return `dest'. The destination must be large enough.
 */
wchar_t *ustrcpy(wchar_t *dest, wchar_t const *source) {
    wchar_t *out = dest;

    while ((*out = *source) != 0) {
        out++;
        source++;
    }
    return dest;
}
248
/*
 * Compare two wide strings element by element.
 *
 * Returns -1, 0 or +1 according to whether `lhs' sorts before,
 * equal to or after `rhs'. NULL is accepted for either argument
 * and sorts before any non-NULL string; two NULLs compare equal.
 */
int ustrcmp(wchar_t *lhs, wchar_t *rhs) {
    if (!lhs || !rhs) {
        if (lhs)
            return +1;
        if (rhs)
            return -1;
        return 0;
    }

    /* Skip the common prefix; equal nonzero elements imply neither ended. */
    for (; *lhs && *lhs == *rhs; lhs++, rhs++)
        continue;

    return (*lhs > *rhs) - (*lhs < *rhs);
}
261
/*
 * Return the lower-case form of `c'. With HAS_TOWLOWER defined the
 * C library's towlower() is used; otherwise only 'A'..'Z' are
 * folded. Zero always maps to zero, which ustricmp depends on.
 */
wchar_t utolower(wchar_t c) {
    wchar_t lowered = c;

    if (c != L'\0') {
#ifdef HAS_TOWLOWER
        lowered = towlower(c);
#else
        if ('A' <= c && c <= 'Z')
            lowered = c + ('a'-'A');
#endif
    }
    return lowered;
}
273
/*
 * Report whether `c' is alphabetic. With HAS_ISWALPHA defined the
 * C library's iswalpha() decides; otherwise only the letters
 * 'A'..'Z' and 'a'..'z' count. Returns nonzero for a letter.
 */
int uisalpha(wchar_t c) {
#ifdef HAS_ISWALPHA
    return iswalpha(c);
#else
    if (c >= 'a' && c <= 'z')
        return 1;
    return c >= 'A' && c <= 'Z';
#endif
}
281
/*
 * Case-insensitive comparison of two wide strings: each element is
 * passed through utolower() before being compared. Returns -1, 0
 * or 1. Neither argument may be NULL.
 */
int ustricmp(wchar_t *lhs, wchar_t *rhs) {
    wchar_t lc, rc;

    for (;;) {
        lc = utolower(*lhs);
        rc = utolower(*rhs);
        /* Stop at the first difference, or when both strings end. */
        if (lc != rc || !lc)
            break;
        lhs++;
        rhs++;
    }

    if (lc == rc)
        return 0;                      /* both reached their terminator */
    return (lc < rc) ? -1 : 1;
}
293
/*
 * Lower-case the wide string `s' in place using utolower(), and
 * return `s'.
 */
wchar_t *ustrlow(wchar_t *s) {
    wchar_t *cursor;

    for (cursor = s; *cursor; cursor++)
        *cursor = utolower(*cursor);
    return s;
}
302
/*
 * Parse a decimal integer from the start of the wide string `s'.
 *
 * An optional leading '-' is accepted, followed by digits '0'..'9';
 * parsing stops at the first character that is not a digit. If no
 * digits are present the result is 0. No overflow detection is
 * performed.
 */
int utoi(wchar_t *s) {
    int sign = +1;
    int n = 0;

    if (*s == L'-') {
        s++;
        sign = -1;
    }

    while (*s >= L'0' && *s <= L'9') {
        n = n * 10 + (*s - L'0');
        s++;
    }

    /* Apply the sign seen above so that "-42" really yields -42. */
    return sign * n;
}
321
322int utob(wchar_t *s) {
323 if (!ustricmp(s, L"yes") || !ustricmp(s, L"y") ||
324 !ustricmp(s, L"true") || !ustricmp(s, L"t"))
325 return TRUE;
326 return FALSE;
327}
328
/*
 * Return 1 if `c' is one of the decimal digits '0'..'9', else 0.
 */
int uisdigit(wchar_t c) {
    if (c < L'0')
        return 0;
    return c <= L'9';
}
332
333#define USTRFTIME_DELTA 128
334wchar_t *ustrftime(wchar_t *wfmt, struct tm *timespec) {
335 void *blk = NULL;
336 wchar_t *wblk, *wp;
337 char *fmt, *text, *p;
338 size_t size = 0;
339 size_t len;
340
341 /*
e4ea58f8 342 * FIXME: really we ought to copy non-% parts of the format
343 * ourselves, and only resort to strftime for % parts. Also we
344 * should use wcsftime if it's present.
345 */
346
347 /*
d7482997 348 * strftime has the entertaining property that it returns 0
349 * _either_ on out-of-space _or_ on successful generation of
350 * the empty string. Hence we must ensure our format can never
351 * generate the empty string. Somebody throw a custard pie at
352 * whoever was responsible for that. Please?
353 */
354 if (wfmt) {
355 len = ustrlen(wfmt);
356 fmt = mknewa(char, 2+len);
e4ea58f8 357 ustrtoa(wfmt, fmt+1, len+1, CS_ASCII); /* CS_FIXME? */
d7482997 358 fmt[0] = ' ';
359 } else
360 fmt = " %c";
361
362 while (1) {
363 size += USTRFTIME_DELTA;
364 blk = resize((char *)blk, size);
365 len = strftime((char *)blk, size-1, fmt, timespec);
366 if (len > 0)
367 break;
368 }
369
370 /* Note: +1 for the terminating 0, -1 for the initial space in fmt */
371 wblk = resize((wchar_t *)blk, len);
372 text = mknewa(char, len);
373 strftime(text, len, fmt+1, timespec);
374 /*
375 * We operate in the C locale, so this all ought to be kosher
376 * ASCII. If we ever move outside ASCII machines, we may need
377 * to make this more portable...
378 */
379 for (wp = wblk, p = text; *p; p++, wp++)
380 *wp = *p;
381 *wp = 0;
382 if (wfmt)
383 sfree(fmt);
384 sfree(text);
385 return wblk;
386}
91f93b94 387
388/*
389 * Determine whether a Unicode string can be translated into a
390 * given charset without any missing characters.
391 */
392int cvt_ok(int charset, const wchar_t *s)
393{
394 char buf[256];
395 charset_state state = CHARSET_INIT_STATE;
396 int err, len = ustrlen(s);
397
398 err = 0;
399 while (len > 0) {
400 (void)charset_from_unicode(&s, &len, buf, lenof(buf),
401 charset, &state, &err);
402 if (err)
403 return FALSE;
404 }
405 return TRUE;
406}