| 1 | /* |
| 2 | * ustring.c: Unicode string routines |
| 3 | */ |
| 4 | |
| 5 | #include <wchar.h> |
| 6 | #include <time.h> |
| 7 | #include "halibut.h" |
| 8 | |
| 9 | wchar_t *ustrdup(wchar_t const *s) { |
| 10 | wchar_t *r; |
| 11 | if (s) { |
| 12 | r = mknewa(wchar_t, 1+ustrlen(s)); |
| 13 | ustrcpy(r, s); |
| 14 | } else { |
| 15 | r = mknew(wchar_t); |
| 16 | *r = 0; |
| 17 | } |
| 18 | return r; |
| 19 | } |
| 20 | |
| 21 | static char *ustrtoa_internal(wchar_t const *s, char *outbuf, int size, |
| 22 | int charset, int careful) { |
| 23 | int len, ret, err; |
| 24 | charset_state state = CHARSET_INIT_STATE; |
| 25 | |
| 26 | if (!s) { |
| 27 | *outbuf = '\0'; |
| 28 | return outbuf; |
| 29 | } |
| 30 | |
| 31 | len = ustrlen(s); |
| 32 | size--; /* leave room for terminating NUL */ |
| 33 | *outbuf = '\0'; |
| 34 | while (len > 0) { |
| 35 | err = 0; |
| 36 | ret = charset_from_unicode(&s, &len, outbuf, size, charset, &state, |
| 37 | (careful ? &err : NULL)); |
| 38 | if (err) |
| 39 | return NULL; |
| 40 | if (!ret) |
| 41 | return outbuf; |
| 42 | size -= ret; |
| 43 | outbuf += ret; |
| 44 | *outbuf = '\0'; |
| 45 | } |
| 46 | /* |
| 47 | * Clean up |
| 48 | */ |
| 49 | ret = charset_from_unicode(NULL, 0, outbuf, size, charset, &state, NULL); |
| 50 | size -= ret; |
| 51 | outbuf += ret; |
| 52 | *outbuf = '\0'; |
| 53 | return outbuf; |
| 54 | } |
| 55 | |
| 56 | char *ustrtoa(wchar_t const *s, char *outbuf, int size, int charset) { |
| 57 | return ustrtoa_internal(s, outbuf, size, charset, FALSE); |
| 58 | } |
| 59 | |
| 60 | char *ustrtoa_careful(wchar_t const *s, char *outbuf, int size, int charset) { |
| 61 | return ustrtoa_internal(s, outbuf, size, charset, TRUE); |
| 62 | } |
| 63 | |
| 64 | wchar_t *ustrfroma(char const *s, wchar_t *outbuf, int size, int charset) { |
| 65 | int len, ret; |
| 66 | charset_state state = CHARSET_INIT_STATE; |
| 67 | |
| 68 | if (!s) { |
| 69 | *outbuf = L'\0'; |
| 70 | return outbuf; |
| 71 | } |
| 72 | |
| 73 | len = strlen(s); |
| 74 | size--; /* allow for terminating NUL */ |
| 75 | *outbuf = L'\0'; |
| 76 | while (len > 0) { |
| 77 | ret = charset_to_unicode(&s, &len, outbuf, size, |
| 78 | charset, &state, NULL, 0); |
| 79 | if (!ret) |
| 80 | return outbuf; |
| 81 | outbuf += ret; |
| 82 | size -= ret; |
| 83 | *outbuf = L'\0'; |
| 84 | } |
| 85 | return outbuf; |
| 86 | } |
| 87 | |
| 88 | char *utoa_internal_dup(wchar_t const *s, int charset, int *lenp, int careful) |
| 89 | { |
| 90 | char *outbuf; |
| 91 | int outpos, outlen, len, ret, err; |
| 92 | charset_state state = CHARSET_INIT_STATE; |
| 93 | |
| 94 | if (!s) { |
| 95 | return dupstr(""); |
| 96 | } |
| 97 | |
| 98 | len = ustrlen(s); |
| 99 | |
| 100 | outlen = len + 10; |
| 101 | outbuf = mknewa(char, outlen); |
| 102 | |
| 103 | outpos = 0; |
| 104 | outbuf[outpos] = '\0'; |
| 105 | |
| 106 | while (len > 0) { |
| 107 | err = 0; |
| 108 | ret = charset_from_unicode(&s, &len, |
| 109 | outbuf + outpos, outlen - outpos - 1, |
| 110 | charset, &state, (careful ? &err : NULL)); |
| 111 | if (err) { |
| 112 | sfree(outbuf); |
| 113 | return NULL; |
| 114 | } |
| 115 | if (!ret) { |
| 116 | outlen = outlen * 3 / 2; |
| 117 | outbuf = resize(outbuf, outlen); |
| 118 | } |
| 119 | outpos += ret; |
| 120 | outbuf[outpos] = '\0'; |
| 121 | } |
| 122 | /* |
| 123 | * Clean up |
| 124 | */ |
| 125 | outlen = outpos + 32; |
| 126 | outbuf = resize(outbuf, outlen); |
| 127 | ret = charset_from_unicode(NULL, 0, |
| 128 | outbuf + outpos, outlen - outpos + 1, |
| 129 | charset, &state, NULL); |
| 130 | outpos += ret; |
| 131 | outbuf[outpos] = '\0'; |
| 132 | if (lenp) |
| 133 | *lenp = outpos; |
| 134 | return outbuf; |
| 135 | } |
| 136 | |
| 137 | char *utoa_dup(wchar_t const *s, int charset) |
| 138 | { |
| 139 | return utoa_internal_dup(s, charset, NULL, FALSE); |
| 140 | } |
| 141 | |
| 142 | char *utoa_dup_len(wchar_t const *s, int charset, int *len) |
| 143 | { |
| 144 | return utoa_internal_dup(s, charset, len, FALSE); |
| 145 | } |
| 146 | |
| 147 | char *utoa_careful_dup(wchar_t const *s, int charset) |
| 148 | { |
| 149 | return utoa_internal_dup(s, charset, NULL, TRUE); |
| 150 | } |
| 151 | |
| 152 | wchar_t *ufroma_dup(char const *s, int charset) { |
| 153 | int len; |
| 154 | wchar_t *buf = NULL; |
| 155 | |
| 156 | len = strlen(s) + 1; |
| 157 | do { |
| 158 | buf = resize(buf, len); |
| 159 | ustrfroma(s, buf, len, charset); |
| 160 | len = (3 * len) / 2 + 1; /* this guarantees a strict increase */ |
| 161 | } while (ustrlen(buf) >= len-1); |
| 162 | |
| 163 | buf = resize(buf, ustrlen(buf)+1); |
| 164 | return buf; |
| 165 | } |
| 166 | |
/*
 * Return the number of wide characters in `s' before its
 * terminating zero. `s' must not be NULL.
 */
int ustrlen(wchar_t const *s) {
    wchar_t const *end = s;

    while (*end)
        end++;
    return (int)(end - s);
}
| 172 | |
/*
 * Step over the wide string at `s' together with its terminating
 * zero, returning a pointer to the element immediately after it.
 */
wchar_t *uadv(wchar_t *s) {
    int skip = ustrlen(s) + 1;         /* characters plus terminator */

    return s + skip;
}
| 176 | |
/*
 * Copy the wide string `source', including its terminating zero,
 * into `dest'. Returns `dest'. No bounds checking is performed.
 */
wchar_t *ustrcpy(wchar_t *dest, wchar_t const *source) {
    wchar_t *out = dest;
    wchar_t ch;

    do {
        ch = *source++;
        *out++ = ch;
    } while (ch);                      /* the terminator is copied too */
    return dest;
}
| 184 | |
/*
 * Compare two wide strings element by element.
 *
 * Returns -1, 0 or +1 according to whether `lhs' sorts before,
 * equal to or after `rhs'. NULL pointers are accepted: two NULLs
 * compare equal, and a single NULL sorts before any non-NULL string.
 */
int ustrcmp(wchar_t *lhs, wchar_t *rhs) {
    if (!lhs || !rhs) {
        if (lhs)
            return +1;                 /* only rhs is NULL */
        if (rhs)
            return -1;                 /* only lhs is NULL */
        return 0;                      /* both NULL */
    }

    /*
     * Skip the common prefix. If *lhs is nonzero and equals *rhs,
     * then *rhs is nonzero too, so it needs no separate test.
     */
    for (; *lhs && *lhs == *rhs; lhs++, rhs++)
        continue;

    return (*lhs > *rhs) - (*lhs < *rhs);
}
| 197 | |
/*
 * Map an upper-case ASCII letter to its lower-case form; every
 * other value, including L'\0', is returned unchanged. (ustricmp
 * relies on zero mapping to zero.)
 *
 * FIXME: this doesn't even come close to real Unicode case mapping.
 */
wchar_t utolower(wchar_t c) {
    int is_ascii_upper = (c >= 'A' && c <= 'Z');

    return is_ascii_upper ? c + ('a' - 'A') : c;
}
| 206 | |
/*
 * Return 1 if `c' is an ASCII letter (A-Z or a-z), 0 otherwise.
 *
 * FIXME: this doesn't even come close to covering Unicode letters.
 */
int uisalpha(wchar_t c) {
    if (c >= 'A' && c <= 'Z')
        return 1;
    if (c >= 'a' && c <= 'z')
        return 1;
    return 0;
}
| 211 | |
/*
 * Compare two wide strings after passing each element through
 * utolower(). Returns 0 if they match, -1 if `lhs' sorts first and
 * 1 otherwise. Neither argument may be NULL.
 */
int ustricmp(wchar_t *lhs, wchar_t *rhs) {
    wchar_t l, r;

    for (;;) {
        l = utolower(*lhs++);
        r = utolower(*rhs++);
        if (l != r)
            return l < r ? -1 : 1;
        if (!l)
            return 0;                  /* both strings ended together */
    }
}
| 223 | |
/*
 * Replace every element of the wide string `s', in place, with the
 * result of utolower() on it. Returns `s'.
 */
wchar_t *ustrlow(wchar_t *s) {
    wchar_t *cur;

    for (cur = s; *cur; cur++)
        *cur = utolower(*cur);
    return s;
}
| 232 | |
/*
 * Parse a decimal integer from the start of the wide string `s'.
 *
 * An optional leading L'-' negates the result. Parsing stops at
 * the first element that is not an ASCII digit; if there are no
 * digits at all the result is 0. No overflow checking is done.
 */
int utoi(wchar_t *s) {
    int sign = +1;
    int n;

    if (*s == L'-') {
        s++;
        sign = -1;
    }

    n = 0;
    while (*s >= L'0' && *s <= L'9') {
        n *= 10;
        n += (*s - L'0');
        s++;
    }

    /* Apply the sign recorded above, so that "-42" yields -42. */
    return sign * n;
}
| 251 | |
| 252 | int utob(wchar_t *s) { |
| 253 | if (!ustricmp(s, L"yes") || !ustricmp(s, L"y") || |
| 254 | !ustricmp(s, L"true") || !ustricmp(s, L"t")) |
| 255 | return TRUE; |
| 256 | return FALSE; |
| 257 | } |
| 258 | |
/*
 * Return 1 if `c' is one of the ASCII digits L'0'..L'9', else 0.
 */
int uisdigit(wchar_t c) {
    if (c < L'0')
        return 0;
    return c <= L'9';
}
| 262 | |
| 263 | #define USTRFTIME_DELTA 128 |
| 264 | wchar_t *ustrftime(wchar_t *wfmt, struct tm *timespec) { |
| 265 | void *blk = NULL; |
| 266 | wchar_t *wblk, *wp; |
| 267 | char *fmt, *text, *p; |
| 268 | size_t size = 0; |
| 269 | size_t len; |
| 270 | |
| 271 | /* |
| 272 | * FIXME: really we ought to copy non-% parts of the format |
| 273 | * ourselves, and only resort to strftime for % parts. Also we |
| 274 | * should use wcsftime if it's present. |
| 275 | */ |
| 276 | |
| 277 | /* |
| 278 | * strftime has the entertaining property that it returns 0 |
| 279 | * _either_ on out-of-space _or_ on successful generation of |
| 280 | * the empty string. Hence we must ensure our format can never |
| 281 | * generate the empty string. Somebody throw a custard pie at |
| 282 | * whoever was responsible for that. Please? |
| 283 | */ |
| 284 | if (wfmt) { |
| 285 | len = ustrlen(wfmt); |
| 286 | fmt = mknewa(char, 2+len); |
| 287 | ustrtoa(wfmt, fmt+1, len+1, CS_ASCII); /* CS_FIXME? */ |
| 288 | fmt[0] = ' '; |
| 289 | } else |
| 290 | fmt = " %c"; |
| 291 | |
| 292 | while (1) { |
| 293 | size += USTRFTIME_DELTA; |
| 294 | blk = resize((char *)blk, size); |
| 295 | len = strftime((char *)blk, size-1, fmt, timespec); |
| 296 | if (len > 0) |
| 297 | break; |
| 298 | } |
| 299 | |
| 300 | /* Note: +1 for the terminating 0, -1 for the initial space in fmt */ |
| 301 | wblk = resize((wchar_t *)blk, len); |
| 302 | text = mknewa(char, len); |
| 303 | strftime(text, len, fmt+1, timespec); |
| 304 | /* |
| 305 | * We operate in the C locale, so this all ought to be kosher |
| 306 | * ASCII. If we ever move outside ASCII machines, we may need |
| 307 | * to make this more portable... |
| 308 | */ |
| 309 | for (wp = wblk, p = text; *p; p++, wp++) |
| 310 | *wp = *p; |
| 311 | *wp = 0; |
| 312 | if (wfmt) |
| 313 | sfree(fmt); |
| 314 | sfree(text); |
| 315 | return wblk; |
| 316 | } |
| 317 | |
| 318 | /* |
| 319 | * Determine whether a Unicode string can be translated into a |
| 320 | * given charset without any missing characters. |
| 321 | */ |
| 322 | int cvt_ok(int charset, const wchar_t *s) |
| 323 | { |
| 324 | char buf[256]; |
| 325 | charset_state state = CHARSET_INIT_STATE; |
| 326 | int err, len = ustrlen(s); |
| 327 | |
| 328 | err = 0; |
| 329 | while (len > 0) { |
| 330 | (void)charset_from_unicode(&s, &len, buf, lenof(buf), |
| 331 | charset, &state, &err); |
| 332 | if (err) |
| 333 | return FALSE; |
| 334 | } |
| 335 | return TRUE; |
| 336 | } |