| 1 | #! /usr/bin/perl -w |
| 2 | # |
| 3 | # This file is part of DisOrder. |
| 4 | # Copyright (C) 2007 Richard Kettlewell |
| 5 | # |
| 6 | # This program is free software; you can redistribute it and/or modify |
| 7 | # it under the terms of the GNU General Public License as published by |
| 8 | # the Free Software Foundation; either version 2 of the License, or |
| 9 | # (at your option) any later version. |
| 10 | # |
| 11 | # This program is distributed in the hope that it will be useful, but |
| 12 | # WITHOUT ANY WARRANTY; without even the implied warranty of |
| 13 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
| 14 | # General Public License for more details. |
| 15 | # |
| 16 | # You should have received a copy of the GNU General Public License |
| 17 | # along with this program; if not, write to the Free Software |
| 18 | # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 |
| 19 | # USA |
| 20 | # |
| 21 | # |
| 22 | # Generate Unicode support tables |
| 23 | # |
| 24 | # This script will download data from unicode.org if the required files |
| 25 | # aren't in the current directory. |
| 26 | # |
| 27 | # After modifying this script you should run: |
| 28 | # make -C lib rebuild-unicode check |
| 29 | # |
| 30 | # Things not supported yet: |
| 31 | # - SpecialCasing.txt data for case mapping |
| 32 | # - Title case offsets |
| 33 | # - Some kind of hinting for composition |
| 34 | # - Word boundary support |
| 35 | # - ... |
| 36 | # |
| 37 | # NB the generated files DO NOT offer a stable ABI and so are not immediately |
| 38 | # suitable for use in a general-purpose library. Things that would need to |
| 39 | # be done: |
| 40 | # - Hide unidata.h from applications; it will never be ABI- or even API-stable. |
# - Stabilize General_Category values
| 42 | # - Extend the unicode.h API to general utility rather than just what |
| 43 | # DisOrder needs. |
| 44 | # - ... |
| 45 | # |
| 46 | use strict; |
| 47 | use File::Basename; |
| 48 | |
# Write all of the arguments to the currently selected output handle,
# dying with the system error message if the write fails.
sub out {
  my @chunks = @_;
  print(@chunks) or die "$!\n";
}
| 52 | |
# Build a string describing the contents of a hash: its values, taken in
# sorted-key order and joined with "-".
#
# The single argument is a hash reference.
sub key {
  my ($d) = @_;
  my @ordered = sort keys %$d;

  return join("-", @{$d}{@ordered});
}
| 59 | |
# Number of code points covered by each subtable.
#
# Adjusting this trades the number of subtables off against their size.
our $modulus = 128;

my (%cats, %data);              # known general categories;
                                # codepoint -> information
my ($max, $maxccc) = (0, 0);    # maximum codepoint; maximum combining class
my ($minud, $maxud) = (0, 0);   # min/max upper case offset
my ($minld, $maxld) = (0, 0);   # min/max lower case offset
| 73 | |
# Make sure we have our desired input files.  We explicitly specify a
# Unicode standard version to make sure that a given version of DisOrder
# supports a given version of Unicode.
#
# input(PATH)
#
# PATH is relative to the UCD directory on unicode.org, for instance
# "auxiliary/WordBreakProperty.txt".  If a file with PATH's basename does
# not exist in the current directory it is fetched with wget and made
# read-only.  STDIN is then reopened on the local copy so that a following
# while(<>) loop reads it.
#
# Dies if the download, the chmod or the open fails.
sub input {
  my $path = shift;
  my $lpath = basename($path);
  if(!-e $lpath) {
    my $url = "http://www.unicode.org/Public/5.0.0/ucd/$path";
    # List-form system() bypasses the shell.  The exit status is checked so
    # that a failed or truncated download is never mistaken for real data.
    if(system("wget", $url) != 0) {
      my $status = $?;
      unlink($lpath);           # discard any partial download
      die "wget $url failed (status $status)\n";
    }
    chmod(0444, $lpath) or die "$lpath: $!\n";
  }
  # Three-argument open: the filename is never parsed for mode characters
  # and leading/trailing whitespace in it is preserved.
  open(STDIN, "<", $lpath) or die "$lpath: $!\n";
}
| 86 | |
| 87 | |
# Read the main data file, UnicodeData.txt.
#
# Each line is a ;-separated record.  For every code point below 0xE0000
# we record the general category, the combining class, the simple
# upper/lower case mappings (stored as offsets from the code point) and any
# decomposition mapping.  The bounds needed later to size the generated C
# types are updated as we go.
input("UnicodeData.txt");
while(<>) {
  my @f = split(/;/, $_);
  # Fields used: 0 code point, 2 General_Category,
  # 3 Canonical_Combining_Class, 5 Decomposition_Type +
  # Decomposition_Mapping, 12 Simple_Uppercase_Mapping,
  # 13 Simple_Lowercase_Mapping
  my ($code, $gc, $ccc, $dm, $upper, $lower) = @f[0, 2, 3, 5, 12, 13];
  my $c = hex($code);
  next if $c >= 0xE0000;        # ignore various high-numbered stuff
                                # TODO justify this exclusion!

  # An empty mapping field means the character maps to itself, i.e. an
  # offset of zero.
  my $ud = (hex($upper) || $c) - $c;
  my $ld = (hex($lower) || $c) - $c;

  # Update bounds on the various values
  if($ccc > $maxccc) { $maxccc = $ccc; }  # assumed never to be -ve
  if($ud < $minud) { $minud = $ud; }
  if($ud > $maxud) { $maxud = $ud; }
  if($ld < $minld) { $minld = $ld; }
  if($ld > $maxld) { $maxld = $ld; }
  if($c > $max) { $max = $c; }
  $cats{$gc} = 1;

  my $info = {
    "gc" => $gc,
    "ccc" => $ccc,
    "ud" => $ud,
    "ld" => $ld,
  };
  if($dm ne '') {
    if($dm =~ /</) {
      # A <tag> marks a compatibility-only decomposition; strip the tag
      $dm =~ s/^<.*>\s*//;
    } else {
      # No tag: a canonical decomposition, which also serves as the
      # compatibility decomposition
      $info->{canon} = $dm;
    }
    $info->{compat} = $dm;
  }
  $data{$c} = $info;
}
| 130 | |
# read_prop_with_ranges(PATH, PROPKEY)
#
# Read a property file in which each non-comment line has the form
# "CODE ; VALUE" or "FIRST..LAST ; VALUE" (hex code points, "#" starts a
# comment).  VALUE is stored under PROPKEY for every listed code point that
# already has an entry in %data; code points without an entry are skipped.
sub read_prop_with_ranges {
  my ($path, $propkey) = @_;
  input($path);
  while(<>) {
    chomp;
    s/\s*\#.*//;
    next if $_ eq '';
    my ($range, $propval) = split(/\s*;\s*/, $_);
    # Treat a single code point as a range of length one
    my ($first, $last);
    if($range =~ /(.*)\.\.(.*)/) {
      ($first, $last) = (hex($1), hex($2));
    } else {
      $first = $last = hex($range);
    }
    for my $c ($first .. $last) {
      next unless exists $data{$c};
      $data{$c}->{$propkey} = $propval;
    }
  }
}
| 154 | |
# Grapheme_Break, Word_Break and Sentence_Break
# NB we do this BEFORE filling in blanks so that the Hangul characters
# don't get filled in; we can compute their properties mechanically.
for my $prop (["auxiliary/GraphemeBreakProperty.txt", "gbreak"],
              ["auxiliary/WordBreakProperty.txt", "wbreak"],
              ["auxiliary/SentenceBreakProperty.txt", "sbreak"]) {
  read_prop_with_ranges($prop->[0], $prop->[1]);
}

# Give every known code point a value for each property and collect the
# full set of values in use.
my (%gbreak, %wbreak, %sbreak);
for my $info (values %data) {
  $info->{gbreak} = 'Other' unless exists $info->{gbreak};
  $gbreak{$info->{gbreak}} = 1;

  # A code point with no explicit Word_Break/Sentence_Break value gets
  # Extend if its Grapheme_Break is Extend, and Other otherwise.
  my $default = $info->{gbreak} eq 'Extend' ? 'Extend' : 'Other';

  $info->{wbreak} = $default unless exists $info->{wbreak};
  $wbreak{$info->{wbreak}} = 1;

  $info->{sbreak} = $default unless exists $info->{sbreak};
  $sbreak{$info->{sbreak}} = 1;
}
| 190 | |
# Move $max up to the last code point of the subtable that contains it, so
# that the tables cover a whole number of subtables
$max = $max - ($max % $modulus) + ($modulus - 1);

# Give every code point up to $max that has no entry yet a default
# "not assigned" record, so that there are no gaps
for my $c (0 .. $max) {
  next if exists $data{$c};
  $data{$c} = {
    "gc" => "Cn",               # not assigned
    "ccc" => 0,
    "ud" => 0,
    "ld" => 0,
    "wbreak" => 'Other',
    "gbreak" => 'Other',
    "sbreak" => 'Other',
  };
}
$cats{'Cn'} = 1;
| 209 | |
# Read the case-folding data too
input("CaseFolding.txt");
while(<>) {
  chomp;
  next if /^\#/ or $_ eq '';
  my ($code, $status, $mapping) = split(/\s*;\s*/, $_);
  # Full case folding means use status C and F.
  # We discard status T, Turkish users may wish to change this.
  next unless ($status eq 'C' || $status eq 'F');

  my $c = hex($code);
  $data{$c}->{casefold} = $mapping;
  my $info = $data{$c};

  # We are particularly interested in combining characters that case-fold
  # to non-combining characters, and in characters that case-fold to
  # sequences with combining characters in non-initial positions, as these
  # require decomposition before case-folding.
  my ($head, @tail) = map(hex($_), split(/\s+/, $mapping));
  if($info->{ccc} != 0) {
    # This is a combining character.  Flag it if the first character of
    # its case-folded form is NOT a combining character.  The field name
    # is the example explicitly mentioned in the spec.
    $info->{ypogegrammeni} = 1 if $data{$head}->{ccc} == 0;
  } else {
    # This is a non-combining character.  Flag it if some non-initial code
    # point of the case-folded form IS a combining character.
    $info->{ypogegrammeni} = 1 if grep($data{$_}->{ccc} != 0, @tail);
  }
}
| 246 | |
# Generate the header file.  From here on out() writes to unidata.h.
open(STDOUT, ">unidata.h") or die "unidata.h: $!\n";

out("/* Automatically generated file, see scripts/make-unidata */\n" .
    "#ifndef UNIDATA_H\n" .
    "#define UNIDATA_H\n");

# One enumerator per General_Category value, in sorted order
# TODO choose stable values for General_Category
out("enum unicode_gc_cat {\n",
    join(",\n", map(" unicode_gc_$_", sort keys %cats)),
    "\n};\n");

# One enum per *_Break property, each followed by the declaration of the
# matching table of value names
for my $prop (["Grapheme_Break", \%gbreak],
              ["Word_Break", \%wbreak],
              ["Sentence_Break", \%sbreak]) {
  my ($pname, $values) = @$prop;
  out("enum unicode_${pname} {\n",
      join(",\n", map(" unicode_${pname}_$_", sort keys %$values)),
      "\n};\n");
  out("extern const char *const unicode_${pname}_names[];\n");
}

out("enum unicode_flags {\n" .
    " unicode_normalize_before_casefold = 1\n" .
    "};\n" .
    "\n");
| 281 | |
# Choose the narrowest C type that will fit the required values.
#
# choosetype(MIN, MAX) returns the name of a C type able to represent every
# integer from MIN to MAX inclusive.
#
# When negative values are required the smallest candidate is "signed char"
# rather than plain "char", because plain char may be an unsigned type and
# would then misrepresent negative offsets.  The lower bound for signed char
# is kept at -127, the limit the C standard guarantees.
sub choosetype {
  my ($min, $max) = @_;
  if($min >= 0) {
    # 0..127 is representable whether plain char is signed or unsigned
    return "char" if $max <= 127;
    return "unsigned char" if $max <= 255;
    return "int16_t" if $max <= 32767;
    return "uint16_t" if $max <= 65535;
    return "int32_t";
  } else {
    return "signed char" if $min >= -127 && $max <= 127;
    # int16_t is an exact-width two's complement type, so -32768 fits
    return "int16_t" if $min >= -32768 && $max <= 32767;
    return "int32_t";
  }
}
| 297 | |
# The per-code-point record.  The offset and combining class members use
# the narrowest types that fit the bounds collected while reading the data.
#
# compat, canon and casefold can contain non-BMP characters, so we can't
# use a simple 16-bit table.  We could use UTF-8 or UTF-16 though, saving a
# bit of space (probably not that much...) at the cost of marginally
# reduced performance and additional complexity.
out("struct unidata {\n",
    map(" $_;\n",
        "const uint32_t *compat",
        "const uint32_t *canon",
        "const uint32_t *casefold",
        choosetype($minud, $maxud) . " upper_offset",
        choosetype($minld, $maxld) . " lower_offset",
        choosetype(0, $maxccc) . " ccc",
        "char gc",
        "uint8_t flags",
        "char grapheme_break",
        "char word_break",
        "char sentence_break"),
    "};\n");

out("extern const struct unidata *const unidata[];\n",
    "#define UNICODE_NCHARS ", ($max + 1), "\n",
    "#define UNICODE_MODULUS $modulus\n",
    "#endif\n");

close(STDOUT) or die "unidata.h: $!\n";
| 324 | |
# Now generate the C file.  From here on out() writes to unidata.c.
open(STDOUT, ">unidata.c") or die "unidata.c: $!\n";

out("/* Automatically generated file, see scripts/make-unidata */\n" .
    "#include <config.h>\n" .
    "#include \"types.h\"\n" .
    "#include \"unidata.h\"\n");

# Short aliases to keep .c file small
out(map("#define $_ unicode_gc_$_\n", sort keys %cats));
{
  # Alias prefix, property name and set of values for each *_Break property
  my @props = (["GB", "Grapheme_Break", \%gbreak],
               ["WB", "Word_Break", \%wbreak],
               ["SB", "Sentence_Break", \%sbreak]);

  for my $prop (@props) {
    my ($abbr, $pname, $values) = @$prop;
    out(map("#define ${abbr}$_ unicode_${pname}_$_\n", sort keys %$values));
  }

  # Names for *_Break properties, in the same sorted order as the
  # enumerators
  for my $prop (@props) {
    my ($abbr, $pname, $values) = @$prop;
    out("const char *const unicode_${pname}_names[] = {\n",
        join(",\n", map(" \"$_\"", sort keys %$values)),
        "\n};\n");
  }
}
| 352 | |
# Generate the decomposition mapping tables.  Each distinct mapping is
# written once, as a zero-terminated array cdN; code points sharing a
# mapping share the array.  The number of arrays avoided this way is
# reported as decompsaved at the end.  In Unicode 5.0.0 this saves 1795
# entries, which is at least 14Kbytes.
my $decompnum = 0;              # next array number to allocate
my %decompnums = ();            # array contents -> array number
my $decompsaved = 0;            # number of duplicate arrays avoided
out("static const uint32_t ");
for my $c (0 .. $max) {
  my $info = $data{$c};
  # If canon is set then compat will be too and will be identical.
  # If compat is set then canon might be clear.  So we work from the
  # compat version and only set the canon symbol when canon exists.
  next unless exists $info->{compat};
  my @cps = map(hex($_), split(/\s+/, $info->{compat}));
  my $s = join(",", @cps, 0);
  if(exists $decompnums{$s}) {
    ++$decompsaved;
  } else {
    # Array definitions are separated by commas inside one declaration
    out(($decompnum != 0 ? ",\n" : ""), "cd$decompnum\[]={$s}");
    $decompnums{$s} = $decompnum++;
  }
  my $sym = "cd$decompnums{$s}";
  $info->{compatsym} = $sym;
  $info->{canonsym} = $sym if exists $info->{canon};
}
out(";\n");
| 381 | |
# ...and the case folding tables.  Again each distinct mapping is written
# once, as a zero-terminated array cfN, and shared between the code points
# that use it.  In Unicode 5.0.0 this saves 51 entries or at least 408
# bytes.  This doesn't seem as worthwhile as the decomposition mapping
# saving above.
my $cfnum = 0;                  # next array number to allocate
my %cfnums = ();                # array contents -> array number
my $cfsaved = 0;                # number of duplicate arrays avoided
out("static const uint32_t ");
for my $c (0 .. $max) {
  my $info = $data{$c};
  next unless exists $info->{casefold};
  my @cps = map(hex($_), split(/\s+/, $info->{casefold}));
  my $s = join(",", @cps, 0);
  if(exists $cfnums{$s}) {
    ++$cfsaved;
  } else {
    # Array definitions are separated by commas inside one declaration
    out(($cfnum != 0 ? ",\n" : ""), "cf$cfnum\[]={$s}");
    $cfnums{$s} = $cfnum++;
  }
  $info->{cfsym} = "cf$cfnums{$s}";
}
out(";\n");
| 404 | |
# Visit all the $modulus-character blocks in turn and generate the
# required subtables.  As above we spot duplicates to save space.  In
# Unicode 5.0.0 with $modulus=128 and current table data this saves
# 1372 subtables or at least three and a half megabytes on 32-bit
# platforms.

my %subtable = ();              # subtable content -> subtable number
my %subtableno = ();            # base code point -> subtable number
my $subtablecounter = 0;        # counter for subtable numbers
my $subtablessaved = 0;         # number of tables saved
for(my $base = 0; $base <= $max; $base += $modulus) {
  # One initializer per code point in the block, with the members in the
  # order in which struct unidata declares them
  my @rows = map {
    my $d = $data{$_};
    my $flags = $d->{ypogegrammeni} ? "unicode_normalize_before_casefold" : 0;
    "{" . join(",",
               ($d->{compatsym} || "0"),
               ($d->{canonsym} || "0"),
               ($d->{cfsym} || "0"),
               $d->{ud},
               $d->{ld},
               $d->{ccc},
               $d->{gc},
               $flags,
               "GB$d->{gbreak}",
               "WB$d->{wbreak}",
               "SB$d->{sbreak}") . "}";
  } $base .. $base + $modulus - 1;
  my $t = join(",\n", @rows);
  if(exists $subtable{$t}) {
    # Identical content has already been written; reuse it
    ++$subtablessaved;
  } else {
    out("static const struct unidata st$subtablecounter\[] = {\n",
        "$t\n",
        "};\n");
    $subtable{$t} = $subtablecounter++;
  }
  $subtableno{$base} = $subtable{$t};
}
| 453 | |
# The top-level table: one pointer per $modulus-character block, each
# referring to the subtable chosen for that block
out("const struct unidata*const unidata[]={\n");
for my $block (0 .. int($max / $modulus)) {
  my $base = $block * $modulus;
  out("st$subtableno{$base},\n");
}
out("};\n");

close(STDOUT) or die "unidata.c: $!\n";

# Report some statistics about the generated tables
print STDERR "max=$max, subtables=$subtablecounter, subtablessaved=$subtablessaved\n";
print STDERR "decompsaved=$decompsaved cfsaved=$cfsaved\n";