A better solution to the problem of duplicated positions in CS_ISO8859_1_X11: add a `sortpriority' hint to sbcs.dat and honour it in sbcsgen.pl.

author simon <simon@cda61777-01e9-0310-a592-d414129be87e>

Thu, 2 Jan 2003 16:56:29 +0000 (16:56 +0000)

committer simon <simon@cda61777-01e9-0310-a592-d414129be87e>

Thu, 2 Jan 2003 16:56:29 +0000 (16:56 +0000)
author simon <simon@cda61777-01e9-0310-a592-d414129be87e>
Thu, 2 Jan 2003 16:56:29 +0000 (16:56 +0000)
committer simon <simon@cda61777-01e9-0310-a592-d414129be87e>
Thu, 2 Jan 2003 16:56:29 +0000 (16:56 +0000)
diff --git a/charset/sbcs.dat b/charset/sbcs.dat

index 7de51c9..a1096d4 100644 (file)
--- a/charset/sbcs.dat
+++ b/charset/sbcs.dat
@@ -306,13 +306,15 @@ charset CS_ISO8859_16
    appear from positions 0x5F to 0x7E inclusive. Here is the modified
    ISO8859-1 code table.
  
-  Note that position 0 is still 0000, not 0020 as it might plausibly
-  be, because I didn't like the idea that converting several words
-  in Unicode through this table would produce NULs in place of all
-  the spaces! In principle that works fine, but it makes me uneasy.
+  Since this table contains a few duplicated positions, we use the
+  `sortpriority' hint to indicate that things in the main part of
+  the code table (0x20-0xFF) should be generated preferentially when
+  converting _from_ Unicode. Hence, U+00b0 (for example) will yield
+  0xb0 rather than 0x07.
  
  charset CS_ISO8859_1_X11
-0000 2666 2592 2409 240c 240d 240a 00b0 00b1 2424 240b 2518 2510 250c 2514 253c
+sortpriority 00-1F -1
+0020 2666 2592 2409 240c 240d 240a 00b0 00b1 2424 240b 2518 2510 250c 2514 253c
  23ba 23bb 2500 23bc 23bd 251c 2524 2534 252c 2502 2264 2265 03c0 2260 00a3 00b7
  0020 0021 0022 0023 0024 0025 0026 0027 0028 0029 002a 002b 002c 002d 002e 002f
  0030 0031 0032 0033 0034 0035 0036 0037 0038 0039 003a 003b 003c 003d 003e 003f
diff --git a/charset/sbcsgen.pl b/charset/sbcsgen.pl

index 0d6ebc9..355bf3b 100644 (file)
--- a/charset/sbcsgen.pl
+++ b/charset/sbcsgen.pl
@@ -27,21 +27,28 @@ my $charsetname = undef;
  my @vals = ();
  
  my @charsetnames = ();
+my @sortpriority = ();
  
  while (<FOO>) {
      chomp;
      if (/^charset (.*)$/) {
         $charsetname = $1;
         @vals = ();
+       @sortpriority = map { 0 } 0..255;
+    } elsif (/^sortpriority ([^-]*)-([^-]*) (.*)$/) {
+       for ($i = hex $1; $i <= hex $2; $i++) {
+           $sortpriority[$i] += $3;
+       }
      } elsif (/^[0-9a-fA-FX]/) {
         push @vals, map { $_ eq "XXXX" ? -1 : hex $_ } split / +/, $_;
         if (scalar @vals > 256) {
             die "$infile:$.: charset $charsetname has more than 256 values\n";
         } elsif (scalar @vals == 256) {
-           &outcharset($charsetname, @vals);
+           &outcharset($charsetname, \@vals, \@sortpriority);
             push @charsetnames, $charsetname;
             $charsetname = undef;
             @vals = ();
+           @sortpriority = map { 0 } 0..255;
         }
      }
  }
@@ -56,8 +63,8 @@ foreach $i (@charsetnames) {
  print "\n";
  print "#endif /* ENUM_CHARSETS */\n";
  
-sub outcharset($@) {
-    my ($name, @vals) = @_;
+sub outcharset($$$) {
+    my ($name, $vals, $sortpriority) = @_;
      my ($prefix, $i, @sorted);
  
      print "static const sbcs_data data_$name = {\n";
@@ -65,11 +72,12 @@ sub outcharset($@) {
      $prefix = "    ";
      @sorted = ();
      for ($i = 0; $i < 256; $i++) {
-       if ($vals[$i] < 0) {
+       if ($vals->[$i] < 0) {
             printf "%sERROR ", $prefix;
         } else {
-           printf "%s0x%04x", $prefix, $vals[$i];
-           push @sorted, [$i, $vals[$i]];
+           printf "%s0x%04x", $prefix, $vals->[$i];
+           die "ooh? $i\n" unless defined $sortpriority->[$i];
+           push @sorted, [$i, $vals->[$i], 0+$sortpriority->[$i]];
         }
         if ($i % 8 == 7) {
             $prefix = ",\n    ";
@@ -78,15 +86,21 @@ sub outcharset($@) {
         }
      }
      print "\n    },\n    {\n";
-    @sorted = sort { $a->[1] <=> $b->[1] } @sorted;
+    @sorted = sort { $a->[1] == $b->[1] ?
+                    $b->[2] <=> $a->[2] :
+                    $a->[1] <=> $b->[1] } @sorted;
      $prefix = "    ";
-    for ($i = 0; $i < scalar @sorted; $i++) {
+    $uval = -1;
+    for ($i = $j = 0; $i < scalar @sorted; $i++) {
+       next if ($uval == $sorted[$i]->[1]); # low-priority alternative
+       $uval = $sorted[$i]->[1];
         printf "%s0x%02x", $prefix, $sorted[$i]->[0];
-       if ($i % 8 == 7) {
+       if ($j % 8 == 7) {
             $prefix = ",\n    ";
         } else {
             $prefix = ", ";
         }
+       $j++;
      }
      printf "\n    },\n    %d\n", scalar @sorted;
      print "};\n";
author	simon <simon@cda61777-01e9-0310-a592-d414129be87e>
	Thu, 2 Jan 2003 16:56:29 +0000 (16:56 +0000)
committer	simon <simon@cda61777-01e9-0310-a592-d414129be87e>
	Thu, 2 Jan 2003 16:56:29 +0000 (16:56 +0000)
charset/sbcs.dat		patch | blob | blame | history
charset/sbcsgen.pl		patch | blob | blame | history