Charset support for the info backend (\cfg{info-charset}). (This
[sgt/halibut] / ustring.c
1 /*
2 * ustring.c: Unicode string routines
3 */
4
5 #include <wchar.h>
6 #include <time.h>
7 #include "halibut.h"
8
9 wchar_t *ustrdup(wchar_t const *s) {
10 wchar_t *r;
11 if (s) {
12 r = mknewa(wchar_t, 1+ustrlen(s));
13 ustrcpy(r, s);
14 } else {
15 r = mknew(wchar_t);
16 *r = 0;
17 }
18 return r;
19 }
20
21 static char *ustrtoa_internal(wchar_t const *s, char *outbuf, int size,
22 int charset, int careful) {
23 int len, ret, err;
24 charset_state state = CHARSET_INIT_STATE;
25
26 if (!s) {
27 *outbuf = '\0';
28 return outbuf;
29 }
30
31 len = ustrlen(s);
32 size--; /* leave room for terminating NUL */
33 *outbuf = '\0';
34 while (len > 0) {
35 err = 0;
36 ret = charset_from_unicode(&s, &len, outbuf, size, charset, &state,
37 (careful ? &err : NULL));
38 if (err)
39 return NULL;
40 if (!ret)
41 return outbuf;
42 size -= ret;
43 outbuf += ret;
44 *outbuf = '\0';
45 }
46 /*
47 * Clean up
48 */
49 ret = charset_from_unicode(NULL, 0, outbuf, size, charset, &state, NULL);
50 size -= ret;
51 outbuf += ret;
52 *outbuf = '\0';
53 return outbuf;
54 }
55
56 char *ustrtoa(wchar_t const *s, char *outbuf, int size, int charset) {
57 return ustrtoa_internal(s, outbuf, size, charset, FALSE);
58 }
59
60 char *ustrtoa_careful(wchar_t const *s, char *outbuf, int size, int charset) {
61 return ustrtoa_internal(s, outbuf, size, charset, TRUE);
62 }
63
64 wchar_t *ustrfroma(char const *s, wchar_t *outbuf, int size, int charset) {
65 int len, ret;
66 charset_state state = CHARSET_INIT_STATE;
67
68 if (!s) {
69 *outbuf = L'\0';
70 return outbuf;
71 }
72
73 len = strlen(s);
74 size--; /* allow for terminating NUL */
75 *outbuf = L'\0';
76 while (len > 0) {
77 ret = charset_to_unicode(&s, &len, outbuf, size,
78 charset, &state, NULL, 0);
79 if (!ret)
80 return outbuf;
81 outbuf += ret;
82 size -= ret;
83 *outbuf = L'\0';
84 }
85 return outbuf;
86 }
87
88 char *utoa_internal_dup(wchar_t const *s, int charset, int *lenp, int careful)
89 {
90 char *outbuf;
91 int outpos, outlen, len, ret, err;
92 charset_state state = CHARSET_INIT_STATE;
93
94 if (!s) {
95 return dupstr("");
96 }
97
98 len = ustrlen(s);
99
100 outlen = len + 10;
101 outbuf = mknewa(char, outlen);
102
103 outpos = 0;
104 outbuf[outpos] = '\0';
105
106 while (len > 0) {
107 err = 0;
108 ret = charset_from_unicode(&s, &len,
109 outbuf + outpos, outlen - outpos - 1,
110 charset, &state, (careful ? &err : NULL));
111 if (err) {
112 sfree(outbuf);
113 return NULL;
114 }
115 if (!ret) {
116 outlen = outlen * 3 / 2;
117 outbuf = resize(outbuf, outlen);
118 }
119 outpos += ret;
120 outbuf[outpos] = '\0';
121 }
122 /*
123 * Clean up
124 */
125 outlen = outpos + 32;
126 outbuf = resize(outbuf, outlen);
127 ret = charset_from_unicode(NULL, 0,
128 outbuf + outpos, outlen - outpos + 1,
129 charset, &state, NULL);
130 outpos += ret;
131 outbuf[outpos] = '\0';
132 if (lenp)
133 *lenp = outpos;
134 return outbuf;
135 }
136
137 char *utoa_dup(wchar_t const *s, int charset)
138 {
139 return utoa_internal_dup(s, charset, NULL, FALSE);
140 }
141
142 char *utoa_dup_len(wchar_t const *s, int charset, int *len)
143 {
144 return utoa_internal_dup(s, charset, len, FALSE);
145 }
146
147 char *utoa_careful_dup(wchar_t const *s, int charset)
148 {
149 return utoa_internal_dup(s, charset, NULL, TRUE);
150 }
151
/*
 * Convert the NUL-terminated byte string `s', encoded in `charset',
 * into a dynamically allocated wide string. A NULL `s' yields an
 * allocated empty string (ustrfroma() treats NULL as empty).
 *
 * The conversion is done into a trial buffer; if the output filled
 * that buffer completely it may have been truncated, so the buffer
 * is enlarged and the conversion repeated. The comparison is made
 * against the size actually used for the attempt, not the enlarged
 * size computed for the next one.
 */
wchar_t *ufroma_dup(char const *s, int charset) {
    int len, used;
    wchar_t *buf = NULL;

    /*
     * One element for the terminator plus one spare, so that input
     * mapping one byte to one wide character is not mistaken for a
     * full buffer on the first attempt.
     */
    len = (s ? (int)strlen(s) : 0) + 2;
    do {
        used = len;
        buf = resize(buf, used);
        ustrfroma(s, buf, used, charset);
        len = (3 * len) / 2 + 1;       /* this guarantees a strict increase */
    } while (ustrlen(buf) >= used-1);

    /* Trim the allocation down to what the string really needs. */
    buf = resize(buf, ustrlen(buf)+1);
    return buf;
}
166
/*
 * Return the number of wide characters in `s' before its
 * terminating zero. `s' must not be NULL.
 */
int ustrlen(wchar_t const *s) {
    wchar_t const *end = s;

    while (*end)
        end++;
    return (int)(end - s);
}
172
/*
 * Return a pointer to the element immediately after the terminating
 * zero of `s' (presumably used to step through several strings
 * stored back to back -- confirm against callers).
 */
wchar_t *uadv(wchar_t *s) {
    int skip = ustrlen(s) + 1;         /* string plus its terminator */

    return s + skip;
}
176
/*
 * Copy the wide string `source', including its terminating zero,
 * into `dest', and return `dest'.
 */
wchar_t *ustrcpy(wchar_t *dest, wchar_t const *source) {
    wchar_t *out = dest;

    /* The copied terminator is what ends the loop. */
    while ((*out++ = *source++) != 0)
        continue;
    return dest;
}
184
/*
 * Compare two wide strings element by element. Returns -1, 0 or +1
 * according to whether `lhs' sorts before, equal to or after `rhs'.
 * A NULL pointer sorts before any non-NULL string, and two NULL
 * pointers compare equal.
 */
int ustrcmp(wchar_t *lhs, wchar_t *rhs) {
    if (!lhs)
        return rhs ? -1 : 0;
    if (!rhs)
        return +1;

    /* Skip the common prefix; equality with a nonzero *lhs implies
     * *rhs is nonzero too. */
    for (; *lhs && *lhs == *rhs; lhs++, rhs++)
        continue;

    return (*lhs > *rhs) - (*lhs < *rhs);
}
197
/*
 * Map the ASCII capitals 'A'..'Z' to lower case; every other value
 * is returned unchanged. In particular L'\0' maps to itself, a
 * property ustricmp relies on.
 */
wchar_t utolower(wchar_t c) {
    int is_ascii_upper;

    if (c == L'\0')
        return c;                      /* this property needed by ustricmp */

    /* FIXME: this doesn't even come close */
    is_ascii_upper = (c >= 'A' && c <= 'Z');
    if (!is_ascii_upper)
        return c;
    return c + ('a'-'A');
}
206
/*
 * Return 1 if `c' is an ASCII letter ('A'..'Z' or 'a'..'z'), and 0
 * otherwise.
 */
int uisalpha(wchar_t c) {
    /* FIXME: this doesn't even come close */
    if (c >= 'A' && c <= 'Z')
        return 1;
    if (c >= 'a' && c <= 'z')
        return 1;
    return 0;
}
211
/*
 * Compare two wide strings after passing each element through
 * utolower(). Returns 0 if they match, -1 if `lhs' sorts first and
 * +1 otherwise. Neither argument may be NULL.
 */
int ustricmp(wchar_t *lhs, wchar_t *rhs) {
    wchar_t lc, rc;

    for (;;) {
        lc = utolower(*lhs);
        rc = utolower(*rhs);
        /* Stop at the first difference or at the shared terminator. */
        if (lc != rc || !lc)
            break;
        lhs++;
        rhs++;
    }

    if (lc == rc)
        return 0;                      /* both strings ended together */
    return (lc < rc) ? -1 : 1;
}
223
/*
 * Replace every element of `s', in place, with the result of
 * utolower() on it, and return `s'.
 */
wchar_t *ustrlow(wchar_t *s) {
    wchar_t *cursor;

    for (cursor = s; *cursor; cursor++)
        *cursor = utolower(*cursor);
    return s;
}
232
/*
 * Parse a decimal integer from the start of the wide string `s'.
 *
 * An optional leading L'-' makes the result negative. Parsing stops
 * at the first element that is not one of L'0'..L'9'; if there are
 * no digits at all the result is 0. No overflow checking is done.
 */
int utoi(wchar_t *s) {
    int sign = +1;
    int n;

    if (*s == L'-') {
        s++;
        sign = -1;
    }

    n = 0;
    while (*s >= L'0' && *s <= L'9') {
        n *= 10;
        n += (*s - L'0');
        s++;
    }

    /* Apply the sign recorded above. */
    return sign * n;
}
251
252 int utob(wchar_t *s) {
253 if (!ustricmp(s, L"yes") || !ustricmp(s, L"y") ||
254 !ustricmp(s, L"true") || !ustricmp(s, L"t"))
255 return TRUE;
256 return FALSE;
257 }
258
/*
 * Return 1 if `c' is one of L'0'..L'9', and 0 otherwise.
 */
int uisdigit(wchar_t c) {
    if (c < L'0')
        return 0;
    return c <= L'9';
}
262
263 #define USTRFTIME_DELTA 128
264 wchar_t *ustrftime(wchar_t *wfmt, struct tm *timespec) {
265 void *blk = NULL;
266 wchar_t *wblk, *wp;
267 char *fmt, *text, *p;
268 size_t size = 0;
269 size_t len;
270
271 /*
272 * FIXME: really we ought to copy non-% parts of the format
273 * ourselves, and only resort to strftime for % parts. Also we
274 * should use wcsftime if it's present.
275 */
276
277 /*
278 * strftime has the entertaining property that it returns 0
279 * _either_ on out-of-space _or_ on successful generation of
280 * the empty string. Hence we must ensure our format can never
281 * generate the empty string. Somebody throw a custard pie at
282 * whoever was responsible for that. Please?
283 */
284 if (wfmt) {
285 len = ustrlen(wfmt);
286 fmt = mknewa(char, 2+len);
287 ustrtoa(wfmt, fmt+1, len+1, CS_ASCII); /* CS_FIXME? */
288 fmt[0] = ' ';
289 } else
290 fmt = " %c";
291
292 while (1) {
293 size += USTRFTIME_DELTA;
294 blk = resize((char *)blk, size);
295 len = strftime((char *)blk, size-1, fmt, timespec);
296 if (len > 0)
297 break;
298 }
299
300 /* Note: +1 for the terminating 0, -1 for the initial space in fmt */
301 wblk = resize((wchar_t *)blk, len);
302 text = mknewa(char, len);
303 strftime(text, len, fmt+1, timespec);
304 /*
305 * We operate in the C locale, so this all ought to be kosher
306 * ASCII. If we ever move outside ASCII machines, we may need
307 * to make this more portable...
308 */
309 for (wp = wblk, p = text; *p; p++, wp++)
310 *wp = *p;
311 *wp = 0;
312 if (wfmt)
313 sfree(fmt);
314 sfree(text);
315 return wblk;
316 }
317
318 /*
319 * Determine whether a Unicode string can be translated into a
320 * given charset without any missing characters.
321 */
322 int cvt_ok(int charset, const wchar_t *s)
323 {
324 char buf[256];
325 charset_state state = CHARSET_INIT_STATE;
326 int err, len = ustrlen(s);
327
328 err = 0;
329 while (len > 0) {
330 (void)charset_from_unicode(&s, &len, buf, lenof(buf),
331 charset, &state, &err);
332 if (err)
333 return FALSE;
334 }
335 return TRUE;
336 }