2 * This file is part of DisOrder
3 * Copyright (C) 2007 Richard Kettlewell
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
10 * This program is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License for more details.
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
20 /** @file lib/unicode.c
21 * @brief Unicode support functions
23 * Here by UTF-8 and UTF-32 we mean the encoding forms of those names (not the
24 * encoding schemes). The primary encoding form is UTF-32 but convenience
25 * wrappers using UTF-8 are provided for a number of functions.
27 * The idea is that all the strings that hit the database will be in a
28 * particular normalization form, and for the search and tags database
29 * in case-folded form, so they can be naively compared within the
32 * As the code stands this guarantee is not well met!
39 #include <stdio.h> /* TODO */
46 /** @defgroup utftransform Functions that transform between different Unicode encoding forms */
49 /** @brief Convert UTF-32 to UTF-8
50 * @param s Source string
51 * @param ns Length of source string in code points
52 * @param ndp Where to store length of destination string (or NULL)
53 * @return Newly allocated destination string or NULL on error
55 * If the UTF-32 is not valid then NULL is returned. A UTF-32 code point is invalid if:
57 * - it codes for a UTF-16 surrogate
58 * - it codes for a value outside the unicode code space
60 * The return value is always 0-terminated. The value returned via @p *ndp
61 * does not include the terminator.
63 char *utf32_to_utf8(const uint32_t *s
, size_t ns
, size_t *ndp
) {
73 dynstr_append(&d
, 0xC0 | (c
>> 6));
74 dynstr_append(&d
, 0x80 | (c
& 0x3F));
75 } else if(c
< 0x10000) {
76 if(c
>= 0xD800 && c
<= 0xDFFF)
78 dynstr_append(&d
, 0xE0 | (c
>> 12));
79 dynstr_append(&d
, 0x80 | ((c
>> 6) & 0x3F));
80 dynstr_append(&d
, 0x80 | (c
& 0x3F));
81 } else if(c
< 0x110000) {
82 dynstr_append(&d
, 0xF0 | (c
>> 18));
83 dynstr_append(&d
, 0x80 | ((c
>> 12) & 0x3F));
84 dynstr_append(&d
, 0x80 | ((c
>> 6) & 0x3F));
85 dynstr_append(&d
, 0x80 | (c
& 0x3F));
99 /** @brief Convert UTF-8 to UTF-32
100 * @param s Source string
101 * @param ns Length of source string in bytes
102 * @param ndp Where to store length of destination string (or NULL)
103 * @return Newly allocated destination string or NULL
105 * The return value is always 0-terminated. The value returned via @p *ndp
106 * does not include the terminator.
108 * If the UTF-8 is not valid then NULL is returned. A UTF-8 sequence
109 * for a code point is invalid if:
110 * - it is not the shortest possible sequence for the code point
111 * - it codes for a UTF-16 surrogate
112 * - it codes for a value outside the unicode code space
114 uint32_t *utf8_to_utf32(const char *s
, size_t ns
, size_t *ndp
) {
115 struct dynstr_ucs4 d
;
117 const uint8_t *ss
= (const uint8_t *)s
;
119 dynstr_ucs4_init(&d
);
123 /* Acceptable UTF-8 is that which codes for Unicode Scalar Values
124 * (Unicode 5.0.0 s3.9 D76)
127 * 7 data bits gives 0x00 - 0x7F and all are acceptable
130 * 11 data bits gives 0x0000 - 0x07FF but only 0x0080 - 0x07FF acceptable
132 * 1110xxxx 10xxxxxx 10xxxxxx
133 * 16 data bits gives 0x0000 - 0xFFFF but only 0x0800 - 0xFFFF acceptable
134 * (and UTF-16 surrogates are not acceptable)
136 * 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
137 * 21 data bits gives 0x00000000 - 0x001FFFFF
138 * but only 0x00010000 - 0x0010FFFF are acceptable
140 * It is NOT the case that the data bits in the first byte are
141 * always non-0 for the acceptable values, so we do a separate check after
147 if(ns
< 1) goto error
;
150 if((c
& 0xC0) != 0x80) goto error
;
151 c32
= (c32
<< 6) | (c
& 0x3F);
152 if(c32
< 0x80) goto error
;
153 } else if(c
<= 0xEF) {
154 if(ns
< 2) goto error
;
157 if((c
& 0xC0) != 0x80) goto error
;
158 c32
= (c32
<< 6) | (c
& 0x3F);
160 if((c
& 0xC0) != 0x80) goto error
;
161 c32
= (c32
<< 6) | (c
& 0x3F);
162 if(c32
< 0x0800 || (c32
>= 0xD800 && c32
<= 0xDFFF)) goto error
;
163 } else if(c
<= 0xF7) {
164 if(ns
< 3) goto error
;
167 if((c
& 0xC0) != 0x80) goto error
;
168 c32
= (c32
<< 6) | (c
& 0x3F);
170 if((c
& 0xC0) != 0x80) goto error
;
171 c32
= (c32
<< 6) | (c
& 0x3F);
173 if((c
& 0xC0) != 0x80) goto error
;
174 c32
= (c32
<< 6) | (c
& 0x3F);
175 if(c32
< 0x00010000 || c32
> 0x0010FFFF) goto error
;
178 dynstr_ucs4_append(&d
, c32
);
180 dynstr_ucs4_terminate(&d
);
190 /** @defgroup utf32 Functions that operate on UTF-32 strings */
193 /** @brief Return the length of a 0-terminated UTF-32 string
194 * @param s Pointer to 0-terminated string
195 * @return Length of string in code points (excluding terminator)
197 * Unlike the conversion functions no validity checking is done on the string.
199 size_t utf32_len(const uint32_t *s
) {
200 const uint32_t *t
= s
;
204 return (size_t)(t
- s
);
207 /** @brief Return the @ref unidata structure for code point @p c
209 * @p c can be any 32-bit value, a sensible value will be returned regardless.
211 static const struct unidata
*utf32__unidata(uint32_t c
) {
212 if(c
< UNICODE_NCHARS
)
213 return &unidata
[c
/ UNICODE_MODULUS
][c
% UNICODE_MODULUS
];
214 else if((c
>= 0xF0000 && c
<= 0xFFFFD)
215 || (c
>= 0x100000 && c
<= 0x10FFFD))
216 return utf32__unidata(0xE000); /* Co */
218 return utf32__unidata(0xFFFF); /* Cn */
221 /** @brief Return the combining class of @p c
222 * @param c Code point
223 * @return Combining class of @p c
225 static inline int utf32__combining_class(uint32_t c
) {
226 return utf32__unidata(c
)->ccc
;
229 /** @brief Stably sort [s,s+ns) into ascending order of combining class
230 * @param s Start of array
231 * @param ns Number of elements, must be at least 1
232 * @param buffer Buffer of at least @p ns elements
234 static void utf32__sort_ccc(uint32_t *s
, size_t ns
, uint32_t *buffer
) {
235 uint32_t *a
, *b
, *bp
;
239 case 1: /* 1-element array is always sorted */
241 case 2: /* 2-element arrays are trivial to sort */
242 if(utf32__combining_class(s
[0]) > utf32__combining_class(s
[1])) {
249 /* Partition the array */
254 /* Sort the two halves of the array */
255 utf32__sort_ccc(a
, na
, buffer
);
256 utf32__sort_ccc(b
, nb
, buffer
);
257 /* Merge them back into one, via the buffer */
259 while(na
> 0 && nb
> 0) {
260 /* We want ascending order of combining class (hence <)
261 * and we want stability within combining classes (hence <=)
263 if(utf32__combining_class(*a
) <= utf32__combining_class(*b
)) {
279 memcpy(s
, buffer
, ns
* sizeof(uint32_t));
284 /** @brief Put combining characters into canonical order
285 * @param s Pointer to UTF-32 string
286 * @param ns Length of @p s
287 * @return 0 on success, -1 on error
289 * @p s is modified in-place. See Unicode 5.0 s3.11 for details of the
292 * Currently we only support a maximum of 1024 combining characters after each
293 * base character. If this limit is exceeded then -1 is returned.
295 static int utf32__canonical_ordering(uint32_t *s
, size_t ns
) {
297 uint32_t buffer
[1024];
299 /* The ordering amounts to a stable sort of each contiguous group of
300 * characters with non-0 combining class. */
302 /* Skip non-combining characters */
303 if(utf32__combining_class(*s
) == 0) {
308 /* We must now have at least one combining character; see how many
310 for(nc
= 1; nc
< ns
&& utf32__combining_class(s
[nc
]) != 0; ++nc
)
315 utf32__sort_ccc(s
, nc
, buffer
);
322 /* Magic numbers from UAX #15 s16 */
330 #define NCount (VCount * TCount)
331 #define SCount (LCount * NCount)
333 /** @brief Guts of the decomposition lookup functions */
334 #define utf32__decompose_one_generic(WHICH) do { \
335 const uint32_t *dc = utf32__unidata(c)->WHICH; \
337 /* Found a canonical decomposition in the table */ \
339 utf32__decompose_one_##WHICH(d, *dc++); \
340 } else if(c >= SBase && c < SBase + SCount) { \
341 /* Mechanically decomposable Hangul syllable (UAX #15 s16) */ \
342 const uint32_t SIndex = c - SBase; \
343 const uint32_t L = LBase + SIndex / NCount; \
344 const uint32_t V = VBase + (SIndex % NCount) / TCount; \
345 const uint32_t T = TBase + SIndex % TCount; \
346 dynstr_ucs4_append(d, L); \
347 dynstr_ucs4_append(d, V); \
349 dynstr_ucs4_append(d, T); \
351 /* Equal to own canonical decomposition */ \
352 dynstr_ucs4_append(d, c); \
355 /** @brief Recursively compute the canonical decomposition of @p c
356 * @param d Dynamic string to store decomposition in
357 * @param c Code point to decompose (must be valid!)
358 * The decomposition is appended to @p d; there is no return value.
360 static void utf32__decompose_one_canon(struct dynstr_ucs4
*d
, uint32_t c
) {
361 utf32__decompose_one_generic(canon
);
364 /** @brief Recursively compute the compatibility decomposition of @p c
365 * @param d Dynamic string to store decomposition in
366 * @param c Code point to decompose (must be valid!)
367 * The decomposition is appended to @p d; there is no return value.
369 static void utf32__decompose_one_compat(struct dynstr_ucs4
*d
, uint32_t c
) {
370 utf32__decompose_one_generic(compat
);
373 /** @brief Guts of the decomposition functions */
374 #define utf32__decompose_generic(WHICH) do { \
375 struct dynstr_ucs4 d; \
378 dynstr_ucs4_init(&d); \
381 if((c >= 0xD800 && c <= 0xDFFF) || c > 0x10FFFF) \
383 utf32__decompose_one_##WHICH(&d, c); \
386 if(utf32__canonical_ordering(d.vec, d.nvec)) \
388 dynstr_ucs4_terminate(&d); \
397 /** @brief Canonically decompose @p [s,s+ns)
398 * @param s Pointer to string
399 * @param ns Length of string
400 * @param ndp Where to store length of result
401 * @return Pointer to result string, or NULL
403 * Computes the canonical decomposition of a string and stably sorts combining
404 * characters into canonical order. The result is in Normalization Form D and
405 * (at the time of writing!) passes the NFD tests defined in Unicode 5.0's
406 * NormalizationTest.txt.
408 * Returns NULL if the string is not valid for either of the following reasons:
409 * - it codes for a UTF-16 surrogate
410 * - it codes for a value outside the unicode code space
412 uint32_t *utf32_decompose_canon(const uint32_t *s
, size_t ns
, size_t *ndp
) {
413 utf32__decompose_generic(canon
);
416 /** @brief Compatibility decompose @p [s,s+ns)
417 * @param s Pointer to string
418 * @param ns Length of string
419 * @param ndp Where to store length of result
420 * @return Pointer to result string, or NULL
422 * Computes the compatibility decomposition of a string and stably sorts
423 * combining characters into canonical order. The result is in Normalization
424 * Form KD and (at the time of writing!) passes the NFKD tests defined in
425 * Unicode 5.0's NormalizationTest.txt.
427 * Returns NULL if the string is not valid for either of the following reasons:
428 * - it codes for a UTF-16 surrogate
429 * - it codes for a value outside the unicode code space
431 uint32_t *utf32_decompose_compat(const uint32_t *s
, size_t ns
, size_t *ndp
) {
432 utf32__decompose_generic(compat
);
435 /** @brief Single-character case-fold and decompose operation */
436 #define utf32__casefold_one(WHICH) do { \
437 const uint32_t *cf = utf32__unidata(c)->casefold; \
439 /* Found a case-fold mapping in the table */ \
441 utf32__decompose_one_##WHICH(&d, *cf++); \
443 utf32__decompose_one_##WHICH(&d, c); \
446 /** @brief Case-fold @p [s,s+ns)
447 * @param s Pointer to string
448 * @param ns Length of string
449 * @param ndp Where to store length of result
450 * @return Pointer to result string, or NULL
452 * Case-fold the string at @p s according to full default case-folding rules
453 * (s3.13) for caseless matching. The result will be in NFD.
455 * Returns NULL if the string is not valid for either of the following reasons:
456 * - it codes for a UTF-16 surrogate
457 * - it codes for a value outside the unicode code space
459 uint32_t *utf32_casefold_canon(const uint32_t *s
, size_t ns
, size_t *ndp
) {
460 struct dynstr_ucs4 d
;
465 /* If the canonical decomposition of the string includes any combining
466 * character that case-folds to a non-combining character then we must
467 * normalize before we fold. In Unicode 5.0.0 this means 0345 COMBINING
468 * GREEK YPOGEGRAMMENI in its decomposition and the various characters that
469 * canonically decompose to it. */
470 for(n
= 0; n
< ns
; ++n
)
471 if(utf32__unidata(s
[n
])->flags
& unicode_normalize_before_casefold
)
474 /* We need a preliminary decomposition */
475 if(!(ss
= utf32_decompose_canon(s
, ns
, &ns
)))
479 dynstr_ucs4_init(&d
);
482 if((c
>= 0xD800 && c
<= 0xDFFF) || c
> 0x10FFFF)
484 utf32__casefold_one(canon
);
487 if(utf32__canonical_ordering(d
.vec
, d
.nvec
))
489 dynstr_ucs4_terminate(&d
);
499 /** @brief Compatibility case-fold @p [s,s+ns)
500 * @param s Pointer to string
501 * @param ns Length of string
502 * @param ndp Where to store length of result
503 * @return Pointer to result string, or NULL
505 * Case-fold the string at @p s according to full default case-folding rules
506 * (s3.13) for compatibility caseless matching. The result will be in NFKD.
508 * Returns NULL if the string is not valid for either of the following reasons:
509 * - it codes for a UTF-16 surrogate
510 * - it codes for a value outside the unicode code space
512 uint32_t *utf32_casefold_compat(const uint32_t *s
, size_t ns
, size_t *ndp
) {
513 struct dynstr_ucs4 d
;
518 for(n
= 0; n
< ns
; ++n
)
519 if(utf32__unidata(s
[n
])->flags
& unicode_normalize_before_casefold
)
522 /* We need a preliminary _canonical_ decomposition */
523 if(!(ss
= utf32_decompose_canon(s
, ns
, &ns
)))
527 /* This computes NFKD(toCaseFold(s)) */
528 #define compat_casefold_middle() do { \
529 dynstr_ucs4_init(&d); \
532 if((c >= 0xD800 && c <= 0xDFFF) || c > 0x10FFFF) \
534 utf32__casefold_one(compat); \
537 if(utf32__canonical_ordering(d.vec, d.nvec)) \
540 /* Do the inner (NFKD o toCaseFold) */
541 compat_casefold_middle();
542 /* We can do away with the NFD'd copy of the input now */
546 /* Do the outer (NFKD o toCaseFold) */
547 compat_casefold_middle();
549 dynstr_ucs4_terminate(&d
);
559 /** @brief Order a pair of UTF-32 strings
560 * @param a First 0-terminated string
561 * @param b Second 0-terminated string
562 * @return -1, 0 or 1 for a less than, equal to or greater than b
564 * "Comparable to strcmp() at its best."
566 int utf32_cmp(const uint32_t *a
, const uint32_t *b
) {
567 while(*a
&& *b
&& *a
== *b
) {
571 return *a
< *b ?
-1 : (*a
> *b ?
1 : 0);
574 /** @brief Return the General_Category value for @p c
576 * @return General_Category property value
578 static inline enum unicode_General_Category
utf32__general_category(uint32_t c
) {
579 return utf32__unidata(c
)->general_category
;
582 /** @brief Check Grapheme_Cluster_Break property
583 * @param c Code point
584 * @return 0 if it is as described, 1 otherwise
586 static int utf32__is_control_or_cr_or_lf(uint32_t c
) {
587 switch(utf32__general_category(c
)) {
590 case unicode_General_Category_Zl
:
591 case unicode_General_Category_Zp
:
592 case unicode_General_Category_Cc
:
594 case unicode_General_Category_Cf
:
595 if(c
== 0x200C || c
== 0x200D)
601 #define Hangul_Syllable_Type_NA 0
602 #define Hangul_Syllable_Type_L 0x1100
603 #define Hangul_Syllable_Type_V 0x1160
604 #define Hangul_Syllable_Type_T 0x11A8
605 #define Hangul_Syllable_Type_LV 0xAC00
606 #define Hangul_Syllable_Type_LVT 0xAC01
608 /** @brief Determine Hangul_Syllable_Type of @p c
609 * @param c Code point
610 * @return Equivalence class of @p c, or Hangul_Syllable_Type_NA
612 * If this is a Hangul character then a representative member of its
613 * equivalence class is returned. Otherwise Hangul_Syllable_Type_NA is
616 static uint32_t utf32__hangul_syllable_type(uint32_t c
) {
617 /* Dispose of the bulk of the non-Hangul code points first */
618 if(c
< 0x1100) return Hangul_Syllable_Type_NA
;
619 if(c
> 0x1200 && c
< 0xAC00) return Hangul_Syllable_Type_NA
;
620 if(c
>= 0xD800) return Hangul_Syllable_Type_NA
;
621 /* Now we pick out the assigned Hangul code points */
622 if((c
>= 0x1100 && c
<= 0x1159) || c
== 0x115F) return Hangul_Syllable_Type_L
;
623 if(c
>= 0x1160 && c
<= 0x11A2) return Hangul_Syllable_Type_V
;
624 if(c
>= 0x11A8 && c
<= 0x11F9) return Hangul_Syllable_Type_T
;
625 if(c
>= 0xAC00 && c
<= 0xD7A3) {
627 return Hangul_Syllable_Type_LV
;
629 return Hangul_Syllable_Type_LVT
;
631 return Hangul_Syllable_Type_NA
;
634 /** @brief Determine Word_Break property
635 * @param c Code point
636 * @return Word_Break property value of @p c
638 static enum unicode_Word_Break
utf32__word_break(uint32_t c
) {
639 if(c
< 0xAC00 || c
> 0xD7A3)
640 return utf32__unidata(c
)->word_break
;
642 return unicode_Word_Break_ALetter
;
645 /** @brief Identify a grapheme cluster boundary
646 * @param s Start of string (must be NFD)
647 * @param ns Length of string
648 * @param n Index within string (in [0,ns])
649 * @return 1 at a grapheme cluster boundary, 0 otherwise
651 * This function identifies default grapheme cluster boundaries as described in
652 * UAX #29 s3. It returns 1 if @p n points at the code point just after a
653 * grapheme cluster boundary (including the hypothetical code point just after
654 * the end of the string).
656 int utf32_is_gcb(const uint32_t *s
, size_t ns
, size_t n
) {
657 uint32_t before
, after
;
658 uint32_t hbefore
, hafter
;
660 if(n
== 0 || n
== ns
)
662 /* Now we know that s[n-1] and s[n] are safe to inspect */
666 if(before
== 0x000D && after
== 0x000A)
669 if(utf32__is_control_or_cr_or_lf(before
)
670 || utf32__is_control_or_cr_or_lf(after
))
672 hbefore
= utf32__hangul_syllable_type(before
);
673 hafter
= utf32__hangul_syllable_type(after
);
675 if(hbefore
== Hangul_Syllable_Type_L
676 && (hafter
== Hangul_Syllable_Type_L
677 || hafter
== Hangul_Syllable_Type_V
678 || hafter
== Hangul_Syllable_Type_LV
679 || hafter
== Hangul_Syllable_Type_LVT
))
682 if((hbefore
== Hangul_Syllable_Type_LV
683 || hbefore
== Hangul_Syllable_Type_V
)
684 && (hafter
== Hangul_Syllable_Type_V
685 || hafter
== Hangul_Syllable_Type_T
))
688 if((hbefore
== Hangul_Syllable_Type_LVT
689 || hbefore
== Hangul_Syllable_Type_T
)
690 && hafter
== Hangul_Syllable_Type_T
)
693 if(utf32__word_break(after
) == unicode_Word_Break_Extend
)
699 /** @brief Return true if @p wb is ignorable for boundary specifications */
700 static inline int utf32__boundary_ignorable(enum unicode_Word_Break wb
) {
701 return (wb
== unicode_Word_Break_Extend
702 || wb
== unicode_Word_Break_Format
);
705 /** @brief Identify a word boundary
706 * @param s Start of string (must be NFD)
707 * @param ns Length of string
708 * @param n Index within string (in [0,ns])
709 * @return 1 at a word boundary, 0 otherwise
711 * This function identifies default word boundaries as described in UAX #29 s4.
712 * It returns 1 if @p n points at the code point just after a word boundary
713 * (including the hypothetical code point just after the end of the string).
715 int utf32_is_word_boundary(const uint32_t *s
, size_t ns
, size_t n
) {
716 enum unicode_Word_Break twobefore
, before
, after
, twoafter
;
720 if(n
== 0 || n
== ns
)
723 if(s
[n
-1] == 0x000D && s
[n
] == 0x000A)
726 /* (!Sep) x (Extend|Format) as in UAX #29 s6.2 */
727 switch(s
[n
-1]) { /* bit of a bodge */
735 if(utf32__boundary_ignorable(utf32__word_break(s
[n
])))
739 /* Gather the property values we'll need for the rest of the test taking the
740 * s6.2 changes into account */
741 /* First we look at the code points after the proposed boundary */
743 after
= utf32__word_break(s
[nn
++]);
744 if(!utf32__boundary_ignorable(after
)) {
745 /* X (Extend|Format)* -> X */
746 while(nn
< ns
&& utf32__boundary_ignorable(utf32__word_break(s
[nn
])))
749 /* It's possible now that nn=ns */
751 twoafter
= utf32__word_break(s
[nn
]);
753 twoafter
= unicode_Word_Break_Other
;
755 /* Next we look at the code points before the proposed boundary. This is a
758 while(nn
> 0 && utf32__boundary_ignorable(utf32__word_break(s
[nn
- 1])))
761 /* s[nn] must be ignorable */
762 before
= utf32__word_break(s
[nn
]);
763 twobefore
= unicode_Word_Break_Other
;
765 /* s[nn] is ignorable or after the proposed boundary; but s[nn-1] is not
767 before
= utf32__word_break(s
[nn
- 1]);
769 /* Repeat the exercise */
770 while(nn
> 0 && utf32__boundary_ignorable(utf32__word_break(s
[nn
- 1])))
773 twobefore
= utf32__word_break(s
[nn
]);
775 twobefore
= utf32__word_break(s
[nn
- 1]);
779 if(before
== unicode_Word_Break_ALetter
780 && after
== unicode_Word_Break_ALetter
)
783 if(before
== unicode_Word_Break_ALetter
784 && after
== unicode_Word_Break_MidLetter
785 && twoafter
== unicode_Word_Break_ALetter
)
788 if(twobefore
== unicode_Word_Break_ALetter
789 && before
== unicode_Word_Break_MidLetter
790 && after
== unicode_Word_Break_ALetter
)
793 if(before
== unicode_Word_Break_Numeric
794 && after
== unicode_Word_Break_Numeric
)
797 if(before
== unicode_Word_Break_ALetter
798 && after
== unicode_Word_Break_Numeric
)
801 if(before
== unicode_Word_Break_Numeric
802 && after
== unicode_Word_Break_ALetter
)
805 if(twobefore
== unicode_Word_Break_Numeric
806 && before
== unicode_Word_Break_MidNum
807 && after
== unicode_Word_Break_Numeric
)
810 if(before
== unicode_Word_Break_Numeric
811 && after
== unicode_Word_Break_MidNum
812 && twoafter
== unicode_Word_Break_Numeric
)
815 if(before
== unicode_Word_Break_Katakana
816 && after
== unicode_Word_Break_Katakana
)
819 if((before
== unicode_Word_Break_ALetter
820 || before
== unicode_Word_Break_Numeric
821 || before
== unicode_Word_Break_Katakana
822 || before
== unicode_Word_Break_ExtendNumLet
)
823 && after
== unicode_Word_Break_ExtendNumLet
)
826 if(before
== unicode_Word_Break_ExtendNumLet
827 && (after
== unicode_Word_Break_ALetter
828 || after
== unicode_Word_Break_Numeric
829 || after
== unicode_Word_Break_Katakana
))
836 /** @defgroup utf8 Functions that operate on UTF-8 strings */
839 /** @brief Wrapper to transform a UTF-8 string using the UTF-32 function */
840 #define utf8__transform(FN) do { \
841 uint32_t *to32 = 0, *decomp32 = 0; \
842 size_t nto32, ndecomp32; \
845 if(!(to32 = utf8_to_utf32(s, ns, &nto32))) goto error; \
846 if(!(decomp32 = FN(to32, nto32, &ndecomp32))) goto error; \
847 decomp8 = utf32_to_utf8(decomp32, ndecomp32, ndp); \
854 /** @brief Canonically decompose @p [s,s+ns)
855 * @param s Pointer to string
856 * @param ns Length of string
857 * @param ndp Where to store length of result
858 * @return Pointer to result string, or NULL
860 * Computes the canonical decomposition of a string and stably sorts combining
861 * characters into canonical order. The result is in Normalization Form D and
862 * (at the time of writing!) passes the NFD tests defined in Unicode 5.0's
863 * NormalizationTest.txt.
865 * Returns NULL if the string is not valid; see utf8_to_utf32() for reasons why
868 * See also utf32_decompose_canon().
870 char *utf8_decompose_canon(const char *s
, size_t ns
, size_t *ndp
) {
871 utf8__transform(utf32_decompose_canon
);
874 /** @brief Compatibility decompose @p [s,s+ns)
875 * @param s Pointer to string
876 * @param ns Length of string
877 * @param ndp Where to store length of result
878 * @return Pointer to result string, or NULL
880 * Computes the compatibility decomposition of a string and stably sorts
881 * combining characters into canonical order. The result is in Normalization
882 * Form KD and (at the time of writing!) passes the NFKD tests defined in
883 * Unicode 5.0's NormalizationTest.txt.
885 * Returns NULL if the string is not valid; see utf8_to_utf32() for reasons why
888 * See also utf32_decompose_compat().
890 char *utf8_decompose_compat(const char *s
, size_t ns
, size_t *ndp
) {
891 utf8__transform(utf32_decompose_compat
);
894 /** @brief Case-fold @p [s,s+ns)
895 * @param s Pointer to string
896 * @param ns Length of string
897 * @param ndp Where to store length of result
898 * @return Pointer to result string, or NULL
900 * Case-fold the string at @p s according to full default case-folding rules
901 * (s3.13). The result will be in NFD.
903 * Returns NULL if the string is not valid; see utf8_to_utf32() for reasons why
906 char *utf8_casefold_canon(const char *s
, size_t ns
, size_t *ndp
) {
907 utf8__transform(utf32_casefold_canon
);
910 /** @brief Compatibility case-fold @p [s,s+ns)
911 * @param s Pointer to string
912 * @param ns Length of string
913 * @param ndp Where to store length of result
914 * @return Pointer to result string, or NULL
916 * Case-fold the string at @p s according to full default case-folding rules
917 * (s3.13). The result will be in NFKD.
919 * Returns NULL if the string is not valid; see utf8_to_utf32() for reasons why
922 char *utf8_casefold_compat(const char *s
, size_t ns
, size_t *ndp
) {
923 utf8__transform(utf32_casefold_compat
);