+# Start of the generated C file: a banner identifying the generator,
+# followed by the headers the generated tables need.
+out(qq{/* Automatically generated file, see scripts/make-unidata */\n},
+    qq{#include <config.h>\n},
+    qq{#include "types.h"\n},
+    qq{#include "unidata.h"\n});
+
+# Short aliases to keep .c file small.  Each row below is (alias prefix,
+# property name, table of values); for every value V of the property, in
+# sorted order, we emit "#define <prefix>V unicode_<property>_V".
+for my $alias (["",   "gc",             \%cats],
+               ["GB", "Grapheme_Break", \%gbreak],
+               ["WB", "Word_Break",     \%wbreak],
+               ["SB", "Sentence_Break", \%sbreak]) {
+    my ($prefix, $property, $values) = @$alias;
+    out(map("#define $prefix$_ unicode_${property}_$_\n",
+            sort keys %$values));
+}
+
+# Names for *_Break properties: one C array of string literals per
+# property, holding that property's value names in sorted order.
+for my $table (["Grapheme_Break", \%gbreak],
+               ["Word_Break",     \%wbreak],
+               ["Sentence_Break", \%sbreak]) {
+    my ($property, $values) = @$table;
+    my @quoted = map(qq{ "$_"}, sort keys %$values);
+    out("const char *const unicode_${property}_names[] = {\n",
+        join(",\n", @quoted),
+        "\n};\n");
+}
+
+# Generate the decomposition mapping tables as a single C declaration of
+# zero-terminated uint32_t arrays named cd0, cd1, ...  Identical mappings
+# share one array; the number of arrays avoided that way is counted in
+# $decompsaved and reported at the end.  In Unicode 5.0.0 this saves 1795
+# entries, which is at least 14Kbytes.
+my $decompnum = 0;              # next unused cdN index
+my %decompnums = ();            # initializer text -> index of its array
+my $decompsaved = 0;            # duplicates found so far
+out("static const uint32_t ");
+for my $c (0 .. $max) {
+    # Whenever canon is set, compat is set too and is identical, but
+    # compat may be set while canon is clear.  So the table is driven
+    # from compat and the canon symbol is filled in afterwards.
+    next unless exists $data{$c}{compat};
+    my $info = $data{$c};
+    my @codes = map { hex($_) } split(/\s+/, $info->{compat});
+    my $init = join(",", @codes, 0);
+    if(exists $decompnums{$init}) {
+        ++$decompsaved;
+    } else {
+        # Arrays after the first are separated by a comma so that they
+        # all form one declaration.
+        out(",\n") if $decompnum;
+        out("cd$decompnum\[]={$init}");
+        $decompnums{$init} = $decompnum++;
+    }
+    my $sym = "cd$decompnums{$init}";
+    $info->{compatsym} = $sym;
+    $info->{canonsym} = $sym if exists $info->{canon};
+}
+out(";\n");
+
+# ...and the case folding table, emitted the same way as zero-terminated
+# uint32_t arrays named cf0, cf1, ...  Again equal entries are compressed
+# to save space.  In Unicode 5.0.0 this saves 51 entries or at least 408
+# bytes, which doesn't seem as worthwhile as the decomposition mapping
+# saving above.
+my $cfnum = 0;                  # next unused cfN index
+my %cfnums = ();                # initializer text -> index of its array
+my $cfsaved = 0;                # duplicates found so far
+out("static const uint32_t ");
+for my $c (0 .. $max) {
+    next unless exists $data{$c}{casefold};
+    my $info = $data{$c};
+    my @codes = map { hex($_) } split(/\s+/, $info->{casefold});
+    my $init = join(",", @codes, 0);
+    if(exists $cfnums{$init}) {
+        ++$cfsaved;
+    } else {
+        # Arrays after the first are separated by a comma so that they
+        # all form one declaration.
+        out(",\n") if $cfnum;
+        out("cf$cfnum\[]={$init}");
+        $cfnums{$init} = $cfnum++;
+    }
+    $info->{cfsym} = "cf$cfnums{$init}";
+}
+out(";\n");
+
+# Visit all the $modulus-character blocks in turn and generate the
+# required subtables. As above we spot duplicates to save space. In
+# Unicode 5.0.0 with $modulus=128 and current table data this saves
+# 1372 subtables or at least three and a half megabytes on 32-bit
+# platforms.