d7482997 |
1 | /* |
2 | * ustring.c: Unicode string routines |
3 | */ |
4 | |
5 | #include <wchar.h> |
6 | #include <time.h> |
7 | #include "halibut.h" |
8 | |
e4ea58f8 |
9 | wchar_t *ustrdup(wchar_t const *s) { |
d7482997 |
10 | wchar_t *r; |
11 | if (s) { |
12 | r = mknewa(wchar_t, 1+ustrlen(s)); |
13 | ustrcpy(r, s); |
14 | } else { |
15 | r = mknew(wchar_t); |
16 | *r = 0; |
17 | } |
18 | return r; |
19 | } |
20 | |
e4ea58f8 |
21 | static char *ustrtoa_internal(wchar_t const *s, char *outbuf, int size, |
22 | int charset, int careful) { |
23 | int len, ret, err; |
24 | charset_state state = CHARSET_INIT_STATE; |
25 | |
d7482997 |
26 | if (!s) { |
27 | *outbuf = '\0'; |
28 | return outbuf; |
29 | } |
e4ea58f8 |
30 | |
31 | len = ustrlen(s); |
32 | size--; /* leave room for terminating NUL */ |
33 | *outbuf = '\0'; |
34 | while (len > 0) { |
35 | err = 0; |
36 | ret = charset_from_unicode(&s, &len, outbuf, size, charset, &state, |
37 | (careful ? &err : NULL)); |
38 | if (err) |
39 | return NULL; |
40 | if (!ret) |
41 | return outbuf; |
42 | size -= ret; |
43 | outbuf += ret; |
44 | *outbuf = '\0'; |
45 | } |
46 | /* |
47 | * Clean up |
48 | */ |
49 | ret = charset_from_unicode(NULL, 0, outbuf, size, charset, &state, NULL); |
50 | size -= ret; |
51 | outbuf += ret; |
52 | *outbuf = '\0'; |
d7482997 |
53 | return outbuf; |
54 | } |
55 | |
e4ea58f8 |
56 | char *ustrtoa(wchar_t const *s, char *outbuf, int size, int charset) { |
57 | return ustrtoa_internal(s, outbuf, size, charset, FALSE); |
58 | } |
59 | |
60 | char *ustrtoa_careful(wchar_t const *s, char *outbuf, int size, int charset) { |
61 | return ustrtoa_internal(s, outbuf, size, charset, TRUE); |
62 | } |
63 | |
64 | wchar_t *ustrfroma(char const *s, wchar_t *outbuf, int size, int charset) { |
65 | int len, ret; |
66 | charset_state state = CHARSET_INIT_STATE; |
67 | |
ba9c1487 |
68 | if (!s) { |
69 | *outbuf = L'\0'; |
70 | return outbuf; |
71 | } |
e4ea58f8 |
72 | |
73 | len = strlen(s); |
74 | size--; /* allow for terminating NUL */ |
75 | *outbuf = L'\0'; |
76 | while (len > 0) { |
77 | ret = charset_to_unicode(&s, &len, outbuf, size, |
78 | charset, &state, NULL, 0); |
79 | if (!ret) |
80 | return outbuf; |
81 | outbuf += ret; |
82 | size -= ret; |
83 | *outbuf = L'\0'; |
84 | } |
ba9c1487 |
85 | return outbuf; |
86 | } |
87 | |
e4ea58f8 |
88 | char *utoa_internal_dup(wchar_t const *s, int charset, int *lenp, int careful) |
89 | { |
90 | char *outbuf; |
91 | int outpos, outlen, len, ret, err; |
92 | charset_state state = CHARSET_INIT_STATE; |
50d6b4bd |
93 | |
e4ea58f8 |
94 | if (!s) { |
95 | return dupstr(""); |
96 | } |
50d6b4bd |
97 | |
e4ea58f8 |
98 | len = ustrlen(s); |
99 | |
100 | outlen = len + 10; |
101 | outbuf = mknewa(char, outlen); |
102 | |
103 | outpos = 0; |
104 | outbuf[outpos] = '\0'; |
105 | |
106 | while (len > 0) { |
107 | err = 0; |
108 | ret = charset_from_unicode(&s, &len, |
109 | outbuf + outpos, outlen - outpos - 1, |
110 | charset, &state, (careful ? &err : NULL)); |
111 | if (err) { |
112 | sfree(outbuf); |
113 | return NULL; |
114 | } |
115 | if (!ret) { |
116 | outlen = outlen * 3 / 2; |
117 | outbuf = resize(outbuf, outlen); |
118 | } |
119 | outpos += ret; |
120 | outbuf[outpos] = '\0'; |
121 | } |
122 | /* |
123 | * Clean up |
124 | */ |
125 | outlen = outpos + 32; |
126 | outbuf = resize(outbuf, outlen); |
127 | ret = charset_from_unicode(NULL, 0, |
128 | outbuf + outpos, outlen - outpos + 1, |
129 | charset, &state, NULL); |
130 | outpos += ret; |
131 | outbuf[outpos] = '\0'; |
132 | if (lenp) |
133 | *lenp = outpos; |
134 | return outbuf; |
50d6b4bd |
135 | } |
136 | |
e4ea58f8 |
137 | char *utoa_dup(wchar_t const *s, int charset) |
138 | { |
139 | return utoa_internal_dup(s, charset, NULL, FALSE); |
140 | } |
141 | |
142 | char *utoa_dup_len(wchar_t const *s, int charset, int *len) |
143 | { |
144 | return utoa_internal_dup(s, charset, len, FALSE); |
145 | } |
146 | |
147 | char *utoa_careful_dup(wchar_t const *s, int charset) |
148 | { |
149 | return utoa_internal_dup(s, charset, NULL, TRUE); |
150 | } |
151 | |
/*
 * Convert the byte string `s`, encoded in `charset`, into a freshly
 * allocated wide string owned by the caller.
 *
 * A NULL `s` is passed straight to ustrfroma(), which produces an
 * empty string for it.
 *
 * ustrfroma() stores at most cap-1 characters plus a terminator, so a
 * result of exactly cap-1 characters may have been truncated. In that
 * case the buffer is enlarged and the conversion redone from scratch.
 * The initial capacity has one spare slot beyond strlen(s)+1 so that a
 * one-character-per-byte conversion is not mistaken for a full buffer.
 */
wchar_t *ufroma_dup(char const *s, int charset) {
    wchar_t *buf = NULL;
    int cap, used;

    cap = (s ? (int)strlen(s) : 0) + 2;
    for (;;) {
        buf = resize(buf, cap);
        ustrfroma(s, buf, cap, charset);
        used = ustrlen(buf);
        if (used < cap - 1)
            break;                     /* definitely not truncated */
        cap = (3 * cap) / 2 + 1;       /* this guarantees a strict increase */
    }

    /* Trim the allocation down to what is actually used. */
    buf = resize(buf, used + 1);
    return buf;
}
166 | |
5dd44dce |
/*
 * Return the number of wide characters in `s`, not counting the
 * terminator. A NULL pointer is treated as the empty string, in line
 * with the other NULL-tolerant routines in this file.
 */
int ustrlen(wchar_t const *s) {
    wchar_t const *p;

    if (!s)
        return 0;

    for (p = s; *p; p++)
        continue;
    return (int)(p - s);
}
172 | |
/*
 * Return a pointer to the element just past the terminator of `s`.
 */
wchar_t *uadv(wchar_t *s) {
    wchar_t *terminator = s + ustrlen(s);
    return terminator + 1;
}
176 | |
5dd44dce |
/*
 * Copy the wide string `source`, terminator included, into `dest`.
 * Returns `dest`.
 */
wchar_t *ustrcpy(wchar_t *dest, wchar_t const *source) {
    wchar_t *out = dest;

    /* The assignment copies the terminator too before the loop ends. */
    while ((*out = *source) != 0) {
        out++;
        source++;
    }
    return dest;
}
184 | |
/*
 * Compare two wide strings element by element.
 *
 * Returns -1, 0 or +1 according to whether `lhs` sorts before, equal
 * to, or after `rhs`. NULL pointers are accepted: two NULLs compare
 * equal, and a NULL sorts before any non-NULL string.
 */
int ustrcmp(wchar_t *lhs, wchar_t *rhs) {
    if (!lhs || !rhs) {
        if (lhs)
            return +1;
        if (rhs)
            return -1;
        return 0;
    }

    /*
     * Skip the common prefix. If *lhs is nonzero and equal to *rhs
     * then *rhs is nonzero too, so it needs no test of its own.
     */
    for (; *lhs && *lhs == *rhs; lhs++, rhs++)
        continue;

    return (*lhs > *rhs) - (*lhs < *rhs);
}
197 | |
/*
 * Map 'A'..'Z' to 'a'..'z'; every other value, including the
 * terminator L'\0', is returned unchanged. ustricmp relies on the
 * terminator mapping to itself.
 *
 * FIXME: this doesn't even come close to real Unicode case folding.
 */
wchar_t utolower(wchar_t c) {
    if (c < 'A' || c > 'Z')
        return c;
    return c + ('a'-'A');
}
206 | |
831da32e |
/*
 * Return 1 if `c` is one of 'A'..'Z' or 'a'..'z', and 0 otherwise.
 *
 * FIXME: this doesn't even come close to covering Unicode letters.
 */
int uisalpha(wchar_t c) {
    int is_upper = (c >= 'A' && c <= 'Z');
    int is_lower = (c >= 'a' && c <= 'z');

    return is_upper || is_lower;
}
211 | |
d7482997 |
/*
 * Compare two wide strings after passing each element through
 * utolower(). Returns -1, 0 or 1.
 *
 * Unlike ustrcmp, this routine dereferences both pointers
 * unconditionally, so neither may be NULL.
 */
int ustricmp(wchar_t *lhs, wchar_t *rhs) {
    wchar_t lc, rc;

    for (;;) {
        lc = utolower(*lhs);
        rc = utolower(*rhs);
        /* Stop at the first difference or at the end of either string. */
        if (lc != rc || !lc || !rc)
            break;
        lhs++;
        rhs++;
    }

    if (!lc && !rc)
        return 0;
    return (lc < rc) ? -1 : 1;
}
223 | |
/*
 * Replace every element of `s` with its utolower() mapping, in place.
 * Returns `s`.
 */
wchar_t *ustrlow(wchar_t *s) {
    wchar_t *cursor;

    for (cursor = s; *cursor; cursor++)
        *cursor = utolower(*cursor);
    return s;
}
232 | |
/*
 * Parse a decimal integer from the start of the wide string `s`.
 *
 * An optional leading '-' is accepted, followed by any number of
 * digits '0'..'9'. Parsing stops at the first non-digit; if there are
 * no digits at all the result is 0. No overflow checking is done.
 */
int utoi(wchar_t *s) {
    int negative = 0;
    int n = 0;

    if (*s == L'-') {
        negative = 1;
        s++;
    }

    while (*s >= L'0' && *s <= L'9') {
        n = n * 10 + (int)(*s - L'0');
        s++;
    }

    /* Apply the sign that was consumed above. */
    return negative ? -n : n;
}
251 | |
252 | int utob(wchar_t *s) { |
253 | if (!ustricmp(s, L"yes") || !ustricmp(s, L"y") || |
254 | !ustricmp(s, L"true") || !ustricmp(s, L"t")) |
255 | return TRUE; |
256 | return FALSE; |
257 | } |
258 | |
/*
 * Return 1 if `c` is one of the digits '0'..'9', and 0 otherwise.
 */
int uisdigit(wchar_t c) {
    if (c < L'0')
        return 0;
    return c <= L'9';
}
262 | |
263 | #define USTRFTIME_DELTA 128 |
264 | wchar_t *ustrftime(wchar_t *wfmt, struct tm *timespec) { |
265 | void *blk = NULL; |
266 | wchar_t *wblk, *wp; |
267 | char *fmt, *text, *p; |
268 | size_t size = 0; |
269 | size_t len; |
270 | |
271 | /* |
e4ea58f8 |
272 | * FIXME: really we ought to copy non-% parts of the format |
273 | * ourselves, and only resort to strftime for % parts. Also we |
274 | * should use wcsftime if it's present. |
275 | */ |
276 | |
277 | /* |
d7482997 |
278 | * strftime has the entertaining property that it returns 0 |
279 | * _either_ on out-of-space _or_ on successful generation of |
280 | * the empty string. Hence we must ensure our format can never |
281 | * generate the empty string. Somebody throw a custard pie at |
282 | * whoever was responsible for that. Please? |
283 | */ |
284 | if (wfmt) { |
285 | len = ustrlen(wfmt); |
286 | fmt = mknewa(char, 2+len); |
e4ea58f8 |
287 | ustrtoa(wfmt, fmt+1, len+1, CS_ASCII); /* CS_FIXME? */ |
d7482997 |
288 | fmt[0] = ' '; |
289 | } else |
290 | fmt = " %c"; |
291 | |
292 | while (1) { |
293 | size += USTRFTIME_DELTA; |
294 | blk = resize((char *)blk, size); |
295 | len = strftime((char *)blk, size-1, fmt, timespec); |
296 | if (len > 0) |
297 | break; |
298 | } |
299 | |
300 | /* Note: +1 for the terminating 0, -1 for the initial space in fmt */ |
301 | wblk = resize((wchar_t *)blk, len); |
302 | text = mknewa(char, len); |
303 | strftime(text, len, fmt+1, timespec); |
304 | /* |
305 | * We operate in the C locale, so this all ought to be kosher |
306 | * ASCII. If we ever move outside ASCII machines, we may need |
307 | * to make this more portable... |
308 | */ |
309 | for (wp = wblk, p = text; *p; p++, wp++) |
310 | *wp = *p; |
311 | *wp = 0; |
312 | if (wfmt) |
313 | sfree(fmt); |
314 | sfree(text); |
315 | return wblk; |
316 | } |