X-Git-Url: https://git.distorted.org.uk/~mdw/disorder/blobdiff_plain/1a05e381782c0c3135a48cc35dd1e58c2a5d0c25..0e843521df080e255a855565e210b7e0caa64511:/scripts/make-unidata

diff --git a/scripts/make-unidata b/scripts/make-unidata
index 259e114..009ae19 100755
--- a/scripts/make-unidata
+++ b/scripts/make-unidata
@@ -31,7 +31,6 @@
 #  - SpecialCasing.txt data for case mapping
 #  - Title case offsets
 #  - Some kind of hinting for composition
-#  - Word boundary support
 #  - ...
 #
 # NB the generated files DO NOT offer a stable ABI and so are not immediately
@@ -131,27 +130,27 @@ while(<>) {
     $maxud = $ud if $ud > $maxud;
     $minld = $ld if $ld < $minld;
     $maxld = $ld if $ld > $maxld;
-    my $d = {
-	"gc" => $gc,
-	"ccc" => $ccc,
-	"ud" => $ud,
-	"ld" => $ld,
-    };
-    if($dm ne '') {
-	if($dm !~ /</) {
-	    # This is a canonical decomposition
-	    $d->{canon} = $dm;
-	    $d->{compat} = $dm;
-	} else {
-	    # This is only a compatibility decomposition
-	    $dm =~ s/^<.*>\s*//;
-	    $d->{compat} = $dm;
-	}
-    }
     if($start != $end) {
-	printf STDERR "> range %04X-%04X is %s\n", $start, $end, $d->{gc};
+	printf STDERR "> range %04X-%04X is %s\n", $start, $end, $gc;
     }
     for($c = $start; $c <= $end; ++$c) {
+	my $d = {
+	    "gc" => $gc,
+	    "ccc" => $ccc,
+	    "ud" => $ud,
+	    "ld" => $ld,
+	};
+	if($dm ne '') {
+	    if($dm !~ /</) {
+		# Untagged mapping: canonical, and therefore compatibility too
+		$d->{canon} = $dm;
+		$d->{compat} = $dm;
+	    } else {
+		# Tagged mapping: compatibility only.  Strip the <tag> from a
+		# copy; $dm itself is re-tested for every $c in the range.
+		($d->{compat} = $dm) =~ s/^<.*>\s*//;
+	    }
+	}
 	$data{$c} = $d;
     }
     $cats{$gc} = 1;
@@ -169,22 +168,17 @@ sub read_prop_with_ranges {
 	my ($range, $propval) = split(/\s*;\s*/, $_);
 	if($range =~ /(.*)\.\.(.*)/) {
 	    for my $c (hex($1) .. hex($2)) {
-		if(exists $data{$c}) {
-		    $data{$c}->{$propkey} = $propval;
-		}
+		die "($range)\n" if($c == 0xAC00 and $propkey eq 'gbreak');
+		$data{$c}->{$propkey} = $propval;
 	    }
 	} else {
 	    my $c = hex($range);
-	    if(exists $data{$c}) {
-		$data{$c}->{$propkey} = $propval;
-	    }
+	    $data{$c}->{$propkey} = $propval;
 	}
     }
 }
 
 # Grapheme_Break etc
-# NB we do this BEFORE filling in blanks so that the Hangul characters
-# don't get filled in; we can compute their properties mechanically.
 read_prop_with_ranges("auxiliary/GraphemeBreakProperty.txt", "gbreak");
 read_prop_with_ranges("auxiliary/WordBreakProperty.txt", "wbreak");
 read_prop_with_ranges("auxiliary/SentenceBreakProperty.txt", "sbreak");
@@ -507,6 +501,7 @@ for(my $base = 0; $base <= $max; $base += $modulus) {
     }
     my $t = join(",\n", @t);
     if(!exists $subtable{$t}) {
+	out(sprintf("/* %04X-%04X */\n", $base, $base + $modulus - 1));
 	out("static const struct unidata st$subtablecounter\[] = {\n",
 	    "$t\n",
 	    "};\n");