mdw@git.distorted.org.uk Git - disorder/blob - lib/unicode.c

   1 /*
   2  * This file is part of DisOrder
   3  * Copyright (C) 2007 Richard Kettlewell
   4  *
   5  * This program is free software; you can redistribute it and/or modify
   6  * it under the terms of the GNU General Public License as published by
   7  * the Free Software Foundation; either version 2 of the License, or
   8  * (at your option) any later version.
   9  *
  10  * This program is distributed in the hope that it will be useful, but
  11  * WITHOUT ANY WARRANTY; without even the implied warranty of
  12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13  * General Public License for more details.
  14  *
  15  * You should have received a copy of the GNU General Public License
  16  * along with this program; if not, write to the Free Software
  17  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
  18  * USA
  19  */
  20 /** @file lib/unicode.c
  21  * @brief Unicode support functions
  22  *
  23  * Here by UTF-8 and UTF-8 we mean the encoding forms of those names (not the
  24  * encoding schemes).  The primary encoding form is UTF-32 but convenience
  25  * wrappers using UTF-8 are provided for a number of functions.
  26  *
  27  * The idea is that all the strings that hit the database will be in a
  28  * particular normalization form, and for the search and tags database
  29  * in case-folded form, so they can be naively compared within the
  30  * database code.
  31  *
  32  * As the code stands this guarantee is not well met!
  33  */
  34
  35 #include <config.h>
  36 #include "types.h"
  37
  38 #include <string.h>
  39 #include <stdio.h>              /* TODO */
  40
  41 #include "mem.h"
  42 #include "vector.h"
  43 #include "unicode.h"
  44 #include "unidata.h"
  45
  46 /** @defgroup utftransform Functions that transform between different Unicode encoding forms */
  47 /*@{*/
  48
  49 /** @brief Convert UTF-32 to UTF-8
  50  * @param s Source string
  51  * @param ns Length of source string in code points
  52  * @param ndp Where to store length of destination string (or NULL)
  53  * @return Newly allocated destination string or NULL on error
  54  *
  55  * If the UTF-32 is not valid then NULL is returned.  A UTF-32 code point is
  56  * invalid if:
  57  * - it codes for a UTF-16 surrogate
  58  * - it codes for a value outside the unicode code space
  59  *
  60  * The return value is always 0-terminated.  The value returned via @p *ndp
  61  * does not include the terminator.
  62  */
  63 char *utf32_to_utf8(const uint32_t *s, size_t ns, size_t *ndp) {
  64   struct dynstr d;
  65   uint32_t c;
  66
  67   dynstr_init(&d);
  68   while(ns > 0) {
  69     c = *s++;
  70     if(c < 0x80)
  71       dynstr_append(&d, c);
  72     else if(c < 0x0800) {
  73       dynstr_append(&d, 0xC0 | (c >> 6));
  74       dynstr_append(&d, 0x80 | (c & 0x3F));
  75     } else if(c < 0x10000) {
  76       if(c >= 0xD800 && c <= 0xDFFF)
  77         goto error;
  78       dynstr_append(&d, 0xE0 | (c >> 12));
  79       dynstr_append(&d, 0x80 | ((c >> 6) & 0x3F));
  80       dynstr_append(&d, 0x80 | (c & 0x3F));
  81     } else if(c < 0x110000) {
  82       dynstr_append(&d, 0xF0 | (c >> 18));
  83       dynstr_append(&d, 0x80 | ((c >> 12) & 0x3F));
  84       dynstr_append(&d, 0x80 | ((c >> 6) & 0x3F));
  85       dynstr_append(&d, 0x80 | (c & 0x3F));
  86     } else
  87       goto error;
  88     --ns;
  89   }
  90   dynstr_terminate(&d);
  91   if(ndp)
  92     *ndp = d.nvec;
  93   return d.vec;
  94 error:
  95   xfree(d.vec);
  96   return 0;
  97 }
  98
  99 /** @brief Convert UTF-8 to UTF-32
 100  * @param s Source string
 101  * @param ns Length of source string in code points
 102  * @param ndp Where to store length of destination string (or NULL)
 103  * @return Newly allocated destination string or NULL
 104  *
 105  * The return value is always 0-terminated.  The value returned via @p *ndp
 106  * does not include the terminator.
 107  *
 108  * If the UTF-8 is not valid then NULL is returned.  A UTF-8 sequence
 109  * for a code point is invalid if:
 110  * - it is not the shortest possible sequence for the code point
 111  * - it codes for a UTF-16 surrogate
 112  * - it codes for a value outside the unicode code space
 113  */
 114 uint32_t *utf8_to_utf32(const char *s, size_t ns, size_t *ndp) {
 115   struct dynstr_ucs4 d;
 116   uint32_t c32, c;
 117   const uint8_t *ss = (const uint8_t *)s;
 118
 119   dynstr_ucs4_init(&d);
 120   while(ns > 0) {
 121     c = *ss++;
 122     --ns;
 123     /* Acceptable UTF-8 is that which codes for Unicode Scalar Values
 124      * (Unicode 5.0.0 s3.9 D76)
 125      *
 126      * 0xxxxxxx
 127      * 7 data bits gives 0x00 - 0x7F and all are acceptable
 128      *
 129      * 110xxxxx 10xxxxxx
 130      * 11 data bits gives 0x0000 - 0x07FF but only 0x0080 - 0x07FF acceptable
 131      *
 132      * 1110xxxx 10xxxxxx 10xxxxxx
 133      * 16 data bits gives 0x0000 - 0xFFFF but only 0x0800 - 0xFFFF acceptable
 134      * (and UTF-16 surrogates are not acceptable)
 135      *
 136      * 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
 137      * 21 data bits gives 0x00000000 - 0x001FFFFF
 138      * but only           0x00010000 - 0x0010FFFF are acceptable
 139      *
 140      * It is NOT always the case that the data bits in the first byte are
 141      * always non-0 for the acceptable values, so we do a separate check after
 142      * decoding.
 143      */
 144     if(c < 0x80)
 145       c32 = c;
 146     else if(c <= 0xDF) {
 147       if(ns < 1) goto error;
 148       c32 = c & 0x1F;
 149       c = *ss++;
 150       if((c & 0xC0) != 0x80) goto error;
 151       c32 = (c32 << 6) | (c & 0x3F);
 152       if(c32 < 0x80) goto error;
 153     } else if(c <= 0xEF) {
 154       if(ns < 2) goto error;
 155       c32 = c & 0x0F;
 156       c = *ss++;
 157       if((c & 0xC0) != 0x80) goto error;
 158       c32 = (c32 << 6) | (c & 0x3F);
 159       c = *ss++;
 160       if((c & 0xC0) != 0x80) goto error;
 161       c32 = (c32 << 6) | (c & 0x3F);
 162       if(c32 < 0x0800 || (c32 >= 0xD800 && c32 <= 0xDFFF)) goto error;
 163     } else if(c <= 0xF7) {
 164       if(ns < 3) goto error;
 165       c32 = c & 0x07;
 166       c = *ss++;
 167       if((c & 0xC0) != 0x80) goto error;
 168       c32 = (c32 << 6) | (c & 0x3F);
 169       c = *ss++;
 170       if((c & 0xC0) != 0x80) goto error;
 171       c32 = (c32 << 6) | (c & 0x3F);
 172       c = *ss++;
 173       if((c & 0xC0) != 0x80) goto error;
 174       c32 = (c32 << 6) | (c & 0x3F);
 175       if(c32 < 0x00010000 || c32 > 0x0010FFFF) goto error;
 176     } else
 177       goto error;
 178     dynstr_ucs4_append(&d, c32);
 179   }
 180   dynstr_ucs4_terminate(&d);
 181   if(ndp)
 182     *ndp = d.nvec;
 183   return d.vec;
 184 error:
 185   xfree(d.vec);
 186   return 0;
 187 }
 188
 189 /*@}*/
 190 /** @defgroup utf32 Functions that operate on UTF-32 strings */
 191 /*@{*/
 192
 193 /** @brief Return the length of a 0-terminated UTF-32 string
 194  * @param s Pointer to 0-terminated string
 195  * @return Length of string in code points (excluding terminator)
 196  *
 197  * Unlike the conversion functions no validity checking is done on the string.
 198  */
 199 size_t utf32_len(const uint32_t *s) {
 200   const uint32_t *t = s;
 201
 202   while(*t)
 203     ++t;
 204   return (size_t)(t - s);
 205 }
 206
 207 /** @brief Return the @ref unidata structure for code point @p c
 208  *
 209  * @p c can be any 32-bit value, a sensible value will be returned regardless.
 210  */
 211 static const struct unidata *utf32__unidata(uint32_t c) {
 212   /* The bottom half of the table contains almost everything of interest
 213    * and we can just return the right thing straight away */
 214   if(c < UNICODE_BREAK_START)
 215     return &unidata[c / UNICODE_MODULUS][c % UNICODE_MODULUS];
 216   /* Within the break everything is unassigned */
 217   if(c < UNICODE_BREAK_END)
 218     return utf32__unidata(0xFFFF);      /* guaranteed to be Cn */
 219   /* Planes 15 and 16 are (mostly) private use */
 220   if((c >= 0xF0000 && c <= 0xFFFFD)
 221      || (c >= 0x100000 && c <= 0x10FFFD))
 222     return utf32__unidata(0xE000);      /* first Co code point */
 223   /* Everything else above the break top is unassigned */
 224   if(c >= UNICODE_BREAK_TOP)
 225     return utf32__unidata(0xFFFF);      /* guaranteed to be Cn */
 226   /* Currently the rest is language tags and variation selectors */
 227   c -= (UNICODE_BREAK_END - UNICODE_BREAK_START);
 228   return &unidata[c / UNICODE_MODULUS][c % UNICODE_MODULUS];
 229 }
 230
 231 /** @brief Return the combining class of @p c
 232  * @param c Code point
 233  * @return Combining class of @p c
 234  */
 235 static inline int utf32__combining_class(uint32_t c) {
 236   return utf32__unidata(c)->ccc;
 237 }
 238
 239 /** @brief Stably sort [s,s+ns) into descending order of combining class
 240  * @param s Start of array
 241  * @param ns Number of elements, must be at least 1
 242  * @param buffer Buffer of at least @p ns elements
 243  */
 244 static void utf32__sort_ccc(uint32_t *s, size_t ns, uint32_t *buffer) {
 245   uint32_t *a, *b, *bp;
 246   size_t na, nb;
 247
 248   switch(ns) {
 249   case 1:                       /* 1-element array is always sorted */
 250     return;
 251   case 2:                       /* 2-element arrays are trivial to sort */
 252     if(utf32__combining_class(s[0]) > utf32__combining_class(s[1])) {
 253       uint32_t tmp = s[0];
 254       s[0] = s[1];
 255       s[1] = tmp;
 256     }
 257     return;
 258   default:
 259     /* Partition the array */
 260     na = ns / 2;
 261     nb = ns - na;
 262     a = s;
 263     b = s + na;
 264     /* Sort the two halves of the array */
 265     utf32__sort_ccc(a, na, buffer);
 266     utf32__sort_ccc(b, nb, buffer);
 267     /* Merge them back into one, via the buffer */
 268     bp = buffer;
 269     while(na > 0 && nb > 0) {
 270       /* We want descending order of combining class (hence <)
 271        * and we want stability within combining classes (hence <=)
 272        */
 273       if(utf32__combining_class(*a) <= utf32__combining_class(*b)) {
 274         *bp++ = *a++;
 275         --na;
 276       } else {
 277         *bp++ = *b++;
 278         --nb;
 279       }
 280     }
 281     while(na > 0) {
 282       *bp++ = *a++;
 283       --na;
 284     }
 285     while(nb > 0) {
 286       *bp++ = *b++;
 287       --nb;
 288     }
 289     memcpy(s, buffer,  ns * sizeof(uint32_t));
 290     return;
 291   }
 292 }
 293
 294 /** @brief Put combining characters into canonical order
 295  * @param s Pointer to UTF-32 string
 296  * @param ns Length of @p s
 297  * @return 0 on success, -1 on error
 298  *
 299  * @p s is modified in-place.  See Unicode 5.0 s3.11 for details of the
 300  * ordering.
 301  *
 302  * Currently we only support a maximum of 1024 combining characters after each
 303  * base character.  If this limit is exceeded then -1 is returned.
 304  */
 305 static int utf32__canonical_ordering(uint32_t *s, size_t ns) {
 306   size_t nc;
 307   uint32_t buffer[1024];
 308
 309   /* The ordering amounts to a stable sort of each contiguous group of
 310    * characters with non-0 combining class. */
 311   while(ns > 0) {
 312     /* Skip non-combining characters */
 313     if(utf32__combining_class(*s) == 0) {
 314       ++s;
 315       --ns;
 316       continue;
 317     }
 318     /* We must now have at least one combining character; see how many
 319      * there are */
 320     for(nc = 1; nc < ns && utf32__combining_class(s[nc]) != 0; ++nc)
 321       ;
 322     if(nc > 1024)
 323       return -1;
 324     /* Sort the array */
 325     utf32__sort_ccc(s, nc, buffer);
 326     s += nc;
 327     ns -= nc;
 328   }
 329   return 0;
 330 }
 331
 332 /* Magic numbers from UAX #15 s16 */
 333 #define SBase 0xAC00
 334 #define LBase 0x1100
 335 #define VBase 0x1161
 336 #define TBase 0x11A7
 337 #define LCount 19
 338 #define VCount 21
 339 #define TCount 28
 340 #define NCount (VCount * TCount)
 341 #define SCount (LCount * NCount)
 342
 343 /** @brief Guts of the decomposition lookup functions */
 344 #define utf32__decompose_one_generic(WHICH) do {                        \
 345   const uint32_t *dc = utf32__unidata(c)->WHICH;                        \
 346   if(dc) {                                                              \
 347     /* Found a canonical decomposition in the table */                  \
 348     while(*dc)                                                          \
 349       utf32__decompose_one_##WHICH(d, *dc++);                           \
 350   } else if(c >= SBase && c < SBase + SCount) {                         \
 351     /* Mechanically decomposable Hangul syllable (UAX #15 s16) */       \
 352     const uint32_t SIndex = c - SBase;                                  \
 353     const uint32_t L = LBase + SIndex / NCount;                         \
 354     const uint32_t V = VBase + (SIndex % NCount) / TCount;              \
 355     const uint32_t T = TBase + SIndex % TCount;                         \
 356     dynstr_ucs4_append(d, L);                                           \
 357     dynstr_ucs4_append(d, V);                                           \
 358     if(T != TBase)                                                      \
 359       dynstr_ucs4_append(d, T);                                         \
 360   } else                                                                \
 361     /* Equal to own canonical decomposition */                          \
 362     dynstr_ucs4_append(d, c);                                           \
 363 } while(0)
 364
 365 /** @brief Recursively compute the canonical decomposition of @p c
 366  * @param d Dynamic string to store decomposition in
 367  * @param c Code point to decompose (must be a valid!)
 368  * @return 0 on success, -1 on error
 369  */
 370 static void utf32__decompose_one_canon(struct dynstr_ucs4 *d, uint32_t c) {
 371   utf32__decompose_one_generic(canon);
 372 }
 373
 374 /** @brief Recursively compute the compatibility decomposition of @p c
 375  * @param d Dynamic string to store decomposition in
 376  * @param c Code point to decompose (must be a valid!)
 377  * @return 0 on success, -1 on error
 378  */
 379 static void utf32__decompose_one_compat(struct dynstr_ucs4 *d, uint32_t c) {
 380   utf32__decompose_one_generic(compat);
 381 }
 382
 383 /** @brief Guts of the decomposition functions */
 384 #define utf32__decompose_generic(WHICH) do {            \
 385   struct dynstr_ucs4 d;                                 \
 386   uint32_t c;                                           \
 387                                                         \
 388   dynstr_ucs4_init(&d);                                 \
 389   while(ns) {                                           \
 390     c = *s++;                                           \
 391     if((c >= 0xD800 && c <= 0xDFFF) || c > 0x10FFFF)    \
 392       goto error;                                       \
 393     utf32__decompose_one_##WHICH(&d, c);                \
 394     --ns;                                               \
 395   }                                                     \
 396   if(utf32__canonical_ordering(d.vec, d.nvec))          \
 397     goto error;                                         \
 398   dynstr_ucs4_terminate(&d);                            \
 399   if(ndp)                                               \
 400     *ndp = d.nvec;                                      \
 401   return d.vec;                                         \
 402 error:                                                  \
 403   xfree(d.vec);                                         \
 404   return 0;                                             \
 405 } while(0)
 406
 407 /** @brief Canonically decompose @p [s,s+ns)
 408  * @param s Pointer to string
 409  * @param ns Length of string
 410  * @param ndp Where to store length of result
 411  * @return Pointer to result string, or NULL
 412  *
 413  * Computes the canonical decomposition of a string and stably sorts combining
 414  * characters into canonical order.  The result is in Normalization Form D and
 415  * (at the time of writing!) passes the NFD tests defined in Unicode 5.0's
 416  * NormalizationTest.txt.
 417  *
 418  * Returns NULL if the string is not valid for either of the following reasons:
 419  * - it codes for a UTF-16 surrogate
 420  * - it codes for a value outside the unicode code space
 421  */
 422 uint32_t *utf32_decompose_canon(const uint32_t *s, size_t ns, size_t *ndp) {
 423   utf32__decompose_generic(canon);
 424 }
 425
 426 /** @brief Compatibility decompose @p [s,s+ns)
 427  * @param s Pointer to string
 428  * @param ns Length of string
 429  * @param ndp Where to store length of result
 430  * @return Pointer to result string, or NULL
 431  *
 432  * Computes the compatibility decomposition of a string and stably sorts
 433  * combining characters into canonical order.  The result is in Normalization
 434  * Form KD and (at the time of writing!) passes the NFKD tests defined in
 435  * Unicode 5.0's NormalizationTest.txt.
 436  *
 437  * Returns NULL if the string is not valid for either of the following reasons:
 438  * - it codes for a UTF-16 surrogate
 439  * - it codes for a value outside the unicode code space
 440  */
 441 uint32_t *utf32_decompose_compat(const uint32_t *s, size_t ns, size_t *ndp) {
 442   utf32__decompose_generic(compat);
 443 }
 444
 445 /** @brief Single-character case-fold and decompose operation */
 446 #define utf32__casefold_one(WHICH) do {                                 \
 447   const uint32_t *cf = utf32__unidata(c)->casefold;                     \
 448   if(cf) {                                                              \
 449     /* Found a case-fold mapping in the table */                        \
 450     while(*cf)                                                          \
 451       utf32__decompose_one_##WHICH(&d, *cf++);                          \
 452   } else                                                                \
 453     utf32__decompose_one_##WHICH(&d, c);                                \
 454 } while(0)
 455
 456 /** @brief Case-fold @p [s,s+ns)
 457  * @param s Pointer to string
 458  * @param ns Length of string
 459  * @param ndp Where to store length of result
 460  * @return Pointer to result string, or NULL
 461  *
 462  * Case-fold the string at @p s according to full default case-folding rules
 463  * (s3.13) for caseless matching.  The result will be in NFD.
 464  *
 465  * Returns NULL if the string is not valid for either of the following reasons:
 466  * - it codes for a UTF-16 surrogate
 467  * - it codes for a value outside the unicode code space
 468  */
 469 uint32_t *utf32_casefold_canon(const uint32_t *s, size_t ns, size_t *ndp) {
 470   struct dynstr_ucs4 d;
 471   uint32_t c;
 472   size_t n;
 473   uint32_t *ss = 0;
 474
 475   /* If the canonical decomposition of the string includes any combining
 476    * character that case-folds to a non-combining character then we must
 477    * normalize before we fold.  In Unicode 5.0.0 this means 0345 COMBINING
 478    * GREEK YPOGEGRAMMENI in its decomposition and the various characters that
 479    * canonically decompose to it. */
 480   for(n = 0; n < ns; ++n)
 481     if(utf32__unidata(s[n])->flags & unicode_normalize_before_casefold)
 482       break;
 483   if(n < ns) {
 484     /* We need a preliminary decomposition */
 485     if(!(ss = utf32_decompose_canon(s, ns, &ns)))
 486       return 0;
 487     s = ss;
 488   }
 489   dynstr_ucs4_init(&d);
 490   while(ns) {
 491     c = *s++;
 492     if((c >= 0xD800 && c <= 0xDFFF) || c > 0x10FFFF)
 493       goto error;
 494     utf32__casefold_one(canon);
 495     --ns;
 496   }
 497   if(utf32__canonical_ordering(d.vec, d.nvec))
 498     goto error;
 499   dynstr_ucs4_terminate(&d);
 500   if(ndp)
 501     *ndp = d.nvec;
 502   return d.vec;
 503 error:
 504   xfree(d.vec);
 505   xfree(ss);
 506   return 0;
 507 }
 508
 509 /** @brief Compatibilit case-fold @p [s,s+ns)
 510  * @param s Pointer to string
 511  * @param ns Length of string
 512  * @param ndp Where to store length of result
 513  * @return Pointer to result string, or NULL
 514  *
 515  * Case-fold the string at @p s according to full default case-folding rules
 516  * (s3.13) for compatibility caseless matching.  The result will be in NFKD.
 517  *
 518  * Returns NULL if the string is not valid for either of the following reasons:
 519  * - it codes for a UTF-16 surrogate
 520  * - it codes for a value outside the unicode code space
 521  */
 522 uint32_t *utf32_casefold_compat(const uint32_t *s, size_t ns, size_t *ndp) {
 523   struct dynstr_ucs4 d;
 524   uint32_t c;
 525   size_t n;
 526   uint32_t *ss = 0;
 527
 528   for(n = 0; n < ns; ++n)
 529     if(utf32__unidata(s[n])->flags & unicode_normalize_before_casefold)
 530       break;
 531   if(n < ns) {
 532     /* We need a preliminary _canonical_ decomposition */
 533     if(!(ss = utf32_decompose_canon(s, ns, &ns)))
 534       return 0;
 535     s = ss;
 536   }
 537   /* This computes NFKD(toCaseFold(s)) */
 538 #define compat_casefold_middle() do {                   \
 539   dynstr_ucs4_init(&d);                                 \
 540   while(ns) {                                           \
 541     c = *s++;                                           \
 542     if((c >= 0xD800 && c <= 0xDFFF) || c > 0x10FFFF)    \
 543       goto error;                                       \
 544     utf32__casefold_one(compat);                        \
 545     --ns;                                               \
 546   }                                                     \
 547   if(utf32__canonical_ordering(d.vec, d.nvec))          \
 548     goto error;                                         \
 549 } while(0)
 550   /* Do the inner (NFKD o toCaseFold) */
 551   compat_casefold_middle();
 552   /* We can do away with the NFD'd copy of the input now */
 553   xfree(ss);
 554   s = ss = d.vec;
 555   ns = d.nvec;
 556   /* Do the outer (NFKD o toCaseFold) */
 557   compat_casefold_middle();
 558   /* That's all */
 559   dynstr_ucs4_terminate(&d);
 560   if(ndp)
 561     *ndp = d.nvec;
 562   return d.vec;
 563 error:
 564   xfree(d.vec);
 565   xfree(ss);
 566   return 0;
 567 }
 568
 569 /** @brief Order a pair of UTF-32 strings
 570  * @param a First 0-terminated string
 571  * @param b Second 0-terminated string
 572  * @return -1, 0 or 1 for a less than, equal to or greater than b
 573  *
 574  * "Comparable to strcmp() at its best."
 575  */
 576 int utf32_cmp(const uint32_t *a, const uint32_t *b) {
 577   while(*a && *b && *a == *b) {
 578     ++a;
 579     ++b;
 580   }
 581   return *a < *b ? -1 : (*a > *b ? 1 : 0);
 582 }
 583
 584 /** @brief Return the General_Category value for @p c
 585  * @param Code point
 586  * @return General_Category property value
 587  */
 588 static inline enum unicode_General_Category utf32__general_category(uint32_t c) {
 589   return utf32__unidata(c)->general_category;
 590 }
 591
 592 /** @brief Determine Grapheme_Break property
 593  * @param c Code point
 594  * @return Grapheme_Break property value of @p c
 595  */
 596 static enum unicode_Grapheme_Break utf32__grapheme_break(uint32_t c) {
 597   return utf32__unidata(c)->grapheme_break;
 598 }
 599
 600 /** @brief Determine Word_Break property
 601  * @param c Code point
 602  * @return Word_Break property value of @p c
 603  */
 604 static enum unicode_Word_Break utf32__word_break(uint32_t c) {
 605   return utf32__unidata(c)->word_break;
 606 }
 607
 608 /** @brief Identify a grapheme cluster boundary
 609  * @param s Start of string (must be NFD)
 610  * @param ns Length of string
 611  * @param n Index within string (in [0,ns].)
 612  * @return 1 at a grapheme cluster boundary, 0 otherwise
 613  *
 614  * This function identifies default grapheme cluster boundaries as described in
 615  * UAX #29 s3.  It returns 1 if @p n points at the code point just after a
 616  * grapheme cluster boundary (including the hypothetical code point just after
 617  * the end of the string).
 618  */
 619 int utf32_is_grapheme_boundary(const uint32_t *s, size_t ns, size_t n) {
 620   uint32_t before, after;
 621   enum unicode_Grapheme_Break gbbefore, gbafter;
 622   /* GB1 and GB2 */
 623   if(n == 0 || n == ns)
 624     return 1;
 625   /* Now we know that s[n-1] and s[n] are safe to inspect */
 626   /* GB3 */
 627   before = s[n-1];
 628   after = s[n];
 629   if(before == 0x000D && after == 0x000A)
 630     return 0;
 631   gbbefore = utf32__grapheme_break(before);
 632   gbafter = utf32__grapheme_break(after);
 633   /* GB4 */
 634   if(gbbefore == unicode_Grapheme_Break_Control
 635      || before == 0x000D
 636      || before == 0x000A)
 637     return 1;
 638   /* GB5 */
 639   if(gbafter == unicode_Grapheme_Break_Control
 640      || after == 0x000D
 641      || after == 0x000A)
 642     return 1;
 643   /* GB6 */
 644   if(gbbefore == unicode_Grapheme_Break_L
 645      && (gbafter == unicode_Grapheme_Break_L
 646          || gbafter == unicode_Grapheme_Break_V
 647          || gbafter == unicode_Grapheme_Break_LV
 648          || gbafter == unicode_Grapheme_Break_LVT))
 649     return 0;
 650   /* GB7 */
 651   if((gbbefore == unicode_Grapheme_Break_LV
 652       || gbbefore == unicode_Grapheme_Break_V)
 653      && (gbafter == unicode_Grapheme_Break_V
 654          || gbafter == unicode_Grapheme_Break_T))
 655     return 0;
 656   /* GB8 */
 657   if((gbbefore == unicode_Grapheme_Break_LVT
 658       || gbbefore == unicode_Grapheme_Break_T)
 659      && gbafter == unicode_Grapheme_Break_T)
 660     return 0;
 661   /* GB9 */
 662   if(utf32__word_break(after) == unicode_Word_Break_Extend)
 663     return 0;
 664   /* GB10 */
 665   return 1;
 666 }
 667
 668 /** @brief Return true if @p c is ignorable for boundary specifications */
 669 static inline int utf32__boundary_ignorable(enum unicode_Word_Break wb) {
 670   return (wb == unicode_Word_Break_Extend
 671           || wb == unicode_Word_Break_Format);
 672 }
 673
 674 /** @brief Identify a word boundary
 675  * @param s Start of string (must be NFD)
 676  * @param ns Length of string
 677  * @param n Index within string (in [0,ns].)
 678  * @return 1 at a word boundary, 0 otherwise
 679  *
 680  * This function identifies default word boundaries as described in UAX #29 s4.
 681  * It returns 1 if @p n points at the code point just after a word boundary
 682  * (including the hypothetical code point just after the end of the string).
 683  */
 684 int utf32_is_word_boundary(const uint32_t *s, size_t ns, size_t n) {
 685   enum unicode_Word_Break twobefore, before, after, twoafter;
 686   size_t nn;
 687
 688   /* WB1 and WB2 */
 689   if(n == 0 || n == ns)
 690     return 1;
 691   /* WB3 */
 692   if(s[n-1] == 0x000D && s[n] == 0x000A)
 693     return 0;
 694   /* WB4 */
 695   /* (!Sep) x (Extend|Format) as in UAX #29 s6.2 */
 696   switch(s[n-1]) {                      /* bit of a bodge */
 697   case 0x000A:
 698   case 0x000D:
 699   case 0x0085:
 700   case 0x2028:
 701   case 0x2029:
 702     break;
 703   default:
 704     if(utf32__boundary_ignorable(utf32__word_break(s[n])))
 705       return 0;
 706     break;
 707   }
 708   /* Gather the property values we'll need for the rest of the test taking the
 709    * s6.2 changes into account */
 710   /* First we look at the code points after the proposed boundary */
 711   nn = n;                               /* <ns */
 712   after = utf32__word_break(s[nn++]);
 713   if(!utf32__boundary_ignorable(after)) {
 714     /* X (Extend|Format)* -> X */
 715     while(nn < ns && utf32__boundary_ignorable(utf32__word_break(s[nn])))
 716       ++nn;
 717   }
 718   /* It's possible now that nn=ns */
 719   if(nn < ns)
 720     twoafter = utf32__word_break(s[nn]);
 721   else
 722     twoafter = unicode_Word_Break_Other;
 723
 724   /* Next we look at the code points before the proposed boundary.  This is a
 725    * bit fiddlier. */
 726   nn = n;
 727   while(nn > 0 && utf32__boundary_ignorable(utf32__word_break(s[nn - 1])))
 728     --nn;
 729   if(nn == 0) {
 730     /* s[nn] must be ignorable */
 731     before = utf32__word_break(s[nn]);
 732     twobefore = unicode_Word_Break_Other;
 733   } else {
 734     /* s[nn] is ignorable or after the proposed boundary; but s[nn-1] is not
 735      * ignorable. */
 736     before = utf32__word_break(s[nn - 1]);
 737     --nn;
 738     /* Repeat the exercise */
 739     while(nn > 0 && utf32__boundary_ignorable(utf32__word_break(s[nn - 1])))
 740       --nn;
 741     if(nn == 0)
 742       twobefore = utf32__word_break(s[nn]);
 743     else
 744       twobefore = utf32__word_break(s[nn - 1]);
 745   }
 746
 747   /* WB5 */
 748   if(before == unicode_Word_Break_ALetter
 749      && after == unicode_Word_Break_ALetter)
 750     return 0;
 751   /* WB6 */
 752   if(before == unicode_Word_Break_ALetter
 753      && after == unicode_Word_Break_MidLetter
 754      && twoafter == unicode_Word_Break_ALetter)
 755     return 0;
 756   /* WB7 */
 757   if(twobefore == unicode_Word_Break_ALetter
 758      && before == unicode_Word_Break_MidLetter
 759      && after == unicode_Word_Break_ALetter)
 760     return 0;
 761   /* WB8 */
 762   if(before == unicode_Word_Break_Numeric
 763      && after == unicode_Word_Break_Numeric)
 764     return 0;
 765   /* WB9 */
 766   if(before == unicode_Word_Break_ALetter
 767      && after == unicode_Word_Break_Numeric)
 768     return 0;
 769   /* WB10 */
 770   if(before == unicode_Word_Break_Numeric
 771      && after == unicode_Word_Break_ALetter)
 772     return 0;
 773    /* WB11 */
 774   if(twobefore == unicode_Word_Break_Numeric
 775      && before == unicode_Word_Break_MidNum
 776      && after == unicode_Word_Break_Numeric)
 777     return 0;
 778   /* WB12 */
 779   if(before == unicode_Word_Break_Numeric
 780      && after == unicode_Word_Break_MidNum
 781      && twoafter == unicode_Word_Break_Numeric)
 782     return 0;
 783   /* WB13 */
 784   if(before == unicode_Word_Break_Katakana
 785      && after == unicode_Word_Break_Katakana)
 786     return 0;
 787   /* WB13a */
 788   if((before == unicode_Word_Break_ALetter
 789       || before == unicode_Word_Break_Numeric
 790       || before == unicode_Word_Break_Katakana
 791       || before == unicode_Word_Break_ExtendNumLet)
 792      && after == unicode_Word_Break_ExtendNumLet)
 793     return 0;
 794   /* WB13b */
 795   if(before == unicode_Word_Break_ExtendNumLet
 796      && (after == unicode_Word_Break_ALetter
 797          || after == unicode_Word_Break_Numeric
 798          || after == unicode_Word_Break_Katakana))
 799     return 0;
 800   /* WB14 */
 801   return 1;
 802 }
 803
 804 /*@}*/
 805 /** @defgroup utf8 Functions that operate on UTF-8 strings */
 806 /*@{*/
 807
 808 /** @brief Wrapper to transform a UTF-8 string using the UTF-32 function */
 809 #define utf8__transform(FN) do {                                \
 810   uint32_t *to32 = 0, *decomp32 = 0;                            \
 811   size_t nto32, ndecomp32;                                      \
 812   char *decomp8 = 0;                                            \
 813                                                                 \
 814   if(!(to32 = utf8_to_utf32(s, ns, &nto32))) goto error;        \
 815   if(!(decomp32 = FN(to32, nto32, &ndecomp32))) goto error;     \
 816   decomp8 = utf32_to_utf8(decomp32, ndecomp32, ndp);            \
 817 error:                                                          \
 818   xfree(to32);                                                  \
 819   xfree(decomp32);                                              \
 820   return decomp8;                                               \
 821 } while(0)
 822
 823 /** @brief Canonically decompose @p [s,s+ns)
 824  * @param s Pointer to string
 825  * @param ns Length of string
 826  * @param ndp Where to store length of result
 827  * @return Pointer to result string, or NULL
 828  *
 829  * Computes the canonical decomposition of a string and stably sorts combining
 830  * characters into canonical order.  The result is in Normalization Form D and
 831  * (at the time of writing!) passes the NFD tests defined in Unicode 5.0's
 832  * NormalizationTest.txt.
 833  *
 834  * Returns NULL if the string is not valid; see utf8_to_utf32() for reasons why
 835  * this might be.
 836  *
 837  * See also utf32_decompose_canon().
 838  */
 839 char *utf8_decompose_canon(const char *s, size_t ns, size_t *ndp) {
 840   utf8__transform(utf32_decompose_canon);
 841 }
 842
 843 /** @brief Compatibility decompose @p [s,s+ns)
 844  * @param s Pointer to string
 845  * @param ns Length of string
 846  * @param ndp Where to store length of result
 847  * @return Pointer to result string, or NULL
 848  *
 849  * Computes the compatibility decomposition of a string and stably sorts
 850  * combining characters into canonical order.  The result is in Normalization
 851  * Form KD and (at the time of writing!) passes the NFKD tests defined in
 852  * Unicode 5.0's NormalizationTest.txt.
 853  *
 854  * Returns NULL if the string is not valid; see utf8_to_utf32() for reasons why
 855  * this might be.
 856  *
 857  * See also utf32_decompose_compat().
 858  */
 859 char *utf8_decompose_compat(const char *s, size_t ns, size_t *ndp) {
 860   utf8__transform(utf32_decompose_compat);
 861 }
 862
 863 /** @brief Case-fold @p [s,s+ns)
 864  * @param s Pointer to string
 865  * @param ns Length of string
 866  * @param ndp Where to store length of result
 867  * @return Pointer to result string, or NULL
 868  *
 869  * Case-fold the string at @p s according to full default case-folding rules
 870  * (s3.13).  The result will be in NFD.
 871  *
 872  * Returns NULL if the string is not valid; see utf8_to_utf32() for reasons why
 873  * this might be.
 874  */
 875 char *utf8_casefold_canon(const char *s, size_t ns, size_t *ndp) {
 876   utf8__transform(utf32_casefold_canon);
 877 }
 878
 879 /** @brief Compatibility case-fold @p [s,s+ns)
 880  * @param s Pointer to string
 881  * @param ns Length of string
 882  * @param ndp Where to store length of result
 883  * @return Pointer to result string, or NULL
 884  *
 885  * Case-fold the string at @p s according to full default case-folding rules
 886  * (s3.13).  The result will be in NFKD.
 887  *
 888  * Returns NULL if the string is not valid; see utf8_to_utf32() for reasons why
 889  * this might be.
 890  */
 891 char *utf8_casefold_compat(const char *s, size_t ns, size_t *ndp) {
 892   utf8__transform(utf32_casefold_compat);
 893 }
 894
 895 /*@}*/
 896
 897 /*
 898 Local Variables:
 899 c-basic-offset:2
 900 comment-column:40
 901 fill-column:79
 902 indent-tabs-mode:nil
 903 End:
 904 */