X-Git-Url: https://git.saurik.com/apple/icu.git/blobdiff_plain/b75a7d8f3b4adbae880cab104ce2c6a50eee4db2..d5d484b0fbe924d3663b177965538d517ee412c1:/icuSources/tools/genpname/preparse.pl?ds=sidebyside

diff --git a/icuSources/tools/genpname/preparse.pl b/icuSources/tools/genpname/preparse.pl
index e7532ae3..3e6c9704 100644
--- a/icuSources/tools/genpname/preparse.pl
+++ b/icuSources/tools/genpname/preparse.pl
@@ -1,7 +1,7 @@
 #!/bin/perl -w
 #*******************************************************************
 # COPYRIGHT:
-# Copyright (c) 2002-2003, International Business Machines Corporation and
+# Copyright (c) 2002-2006, International Business Machines Corporation and
 # others. All Rights Reserved.
 #*******************************************************************
 
@@ -53,7 +53,7 @@ use FileHandle;
 use strict;
 use Dumpvalue;
 
-my $DEBUG = 0;
+my $DEBUG = 1;
 my $DUMPER = new Dumpvalue;
 
 my $count = @ARGV;
@@ -65,6 +65,10 @@ my $UNIDATA_DIR = "$ICU_DIR/source/data/unidata";
 # Get the current year from the system
 my $YEAR = 1900+@{[localtime]}[5]; # Get the current year
 
+# Used to make "n/a" property [value] aliases (Unicode or Synthetic) unique
+my $propNA = 0;
+my $valueNA = 0;
+
 #----------------------------------------------------------------------
 # Top level property keys for binary, enumerated, string, and double props
 my @TOP     = qw( _bp _ep _sp _dp _mp );
@@ -94,10 +98,6 @@ my %UNSUPPORTED = (Composition_Exclusion => 1,
                    Expands_On_NFKD => 1,
                    FC_NFKC_Closure => 1,
                    ID_Start_Exceptions => 1,
-                   NFC_Quick_Check => 1,
-                   NFD_Quick_Check => 1,
-                   NFKC_Quick_Check => 1,
-                   NFKD_Quick_Check => 1,
                    Special_Case_Condition => 1,
                    );
 
@@ -106,6 +106,16 @@ my %UNSUPPORTED = (Composition_Exclusion => 1,
 # missing.
 my %MISSING_FROM_UCHAR;
 
+# Additional property aliases beyond short and long names,
+# like space in addition to WSpace and White_Space in Unicode 4.1.
+# Hashtable, maps long name to alias.
+# For example, maps White_Space->space.
+#
+# If multiple additional aliases are defined,
+# then they are separated in the value string with '|'.
+# For example, White_Space->space|outer_space
+my %additional_property_aliases;
+
 #----------------------------------------------------------------------
 
 # Emitted class names
@@ -298,7 +308,7 @@ END
                 $i = $groupToInt{$groupString};
             } else {
                 my @names = split(/\|/, $groupString);
-                die "Error: Wrong number of names in " . $groupString if (@names != 2);
+                die "Error: Wrong number of names in " . $groupString if (@names < 1);
                 $i = @nameGroups; # index of group we are making 
                 $groupToInt{$groupString} = $i; # Cache for reuse
                 push @nameGroups, map { $stringToID{$_} } @names;
@@ -314,6 +324,7 @@ END
 
     print "int32_t NAME_GROUP[] = {\n";
     # emit one group per line, with annotations
+    my $max_names = 0;
     for (my $i=0; $i<@nameGroups; ) {
         my @a;
         my $line;
@@ -329,12 +340,14 @@ END
               ' 'x(20-length($line)),
               "/* ", sprintf("%3d", $start),
               ": \"", join("\", \"", map { $strings[$_] } @a), "\" */\n";
+        $max_names = @a if(@a > $max_names);
+          
     }
     print "};\n\n";
     
     # This is fixed for 3.2 at "2" but should be calculated dynamically
     # when more than 2 names appear in Property[Value]Aliases.txt.
-    print "#define MAX_NAMES_PER_GROUP 2\n\n";
+    print "#define MAX_NAMES_PER_GROUP $max_names\n\n";
 
     #------------------------------------------------------------
     # Emit enumerated property values
@@ -416,7 +429,9 @@ sub readAndMerge {
     my $pa = {};
     read_PropertyAliases($pa, "$unidataDir/PropertyAliases.txt");
     read_PropertyAliases($pa, "SyntheticPropertyAliases.txt");
-    my $va = read_PropertyValueAliases("$unidataDir/PropertyValueAliases.txt");
+    my $va = {};
+    read_PropertyValueAliases($va, "$unidataDir/PropertyValueAliases.txt");
+    read_PropertyValueAliases($va, "SyntheticPropertyValueAliases.txt");
     
     # Extract property family hash
     my $fam = $pa->{'_family'};
@@ -575,14 +590,29 @@ sub merge_PropertyAliases {
 
     for my $subh (map { $h->{$_} } @TOP) {
         for my $enum (keys %$subh) {
-            my $name = $subh->{$enum};
-            die "Error: Property $name not found (or used more than once)"
-                unless (exists $pa->{$name});
+            my $long_name = $subh->{$enum};
+            if (!exists $pa->{$long_name}) {
+                die "Error: Property $long_name not found (or used more than once)";
+            }
 
-            $subh->{$enum} = $pa->{$name} . "|" . $name;
-            delete $pa->{$name};
+            my $value;
+            if($pa->{$long_name} =~ m|^n/a\d*$|) {
+                # replace an "n/a" short name with an empty name (nothing before "|");
+                # don't remove it (don't remove the "|"): there must always be a long name,
+                # and if the short name is removed, then the long name becomes the
+                # short name and there is no long name left (unless there is another alias)
+                $value = "|" . $long_name;
+            } else {
+                $value = $pa->{$long_name} . "|" . $long_name;
+            }
+            if (exists $additional_property_aliases{$long_name}) {
+                $value .= "|" . $additional_property_aliases{$long_name};
+            }
+            $subh->{$enum} = $value;
+            delete $pa->{$long_name};
         }
     }
+
     my @err;
     for my $name (keys %$pa) {
         $MISSING_FROM_UCHAR{$pa->{$name}} = 1;
@@ -640,7 +670,7 @@ sub merge_PropertyValueAliases {
             # look up both long and short & ignore case
             my $n;
             if (exists $pva->{$name}) {
-                $n = $name;
+                $n = $name; 
             } else {
                 # iterate (slow)
                 for my $a (keys %$pva) {
@@ -653,7 +683,7 @@ sub merge_PropertyValueAliases {
                     }
                 }
             }
-
+                
             # For blocks, do a loose match from Blocks.txt pseudo-name
             # to PropertyValueAliases long name.
             if (!$n && $prop eq 'blk') {
@@ -673,9 +703,9 @@ sub merge_PropertyValueAliases {
 
             my $l = $n;
             my $r = $pva->{$n};
-            # convert |n/a\d+| to blank
-            $l = '' if ($l =~ m|^n/a\d+$|);
-            $r = '' if ($r =~ m|^n/a\d+$|);
+            # convert |n/a\d*| to blank
+            $l = '' if ($l =~ m|^n/a\d*$|);
+            $r = '' if ($r =~ m|^n/a\d*$|);
 
             $hh->{$enum} = "$l|$r";
             # Don't delete the 'gc' properties because we need to share
@@ -689,11 +719,14 @@ sub merge_PropertyValueAliases {
     }
 
     # Merge the combining class values in manually
+    # Add the same values to the synthetic lccc and tccc properties
     die "Error: No ccc data"
         unless exists $va->{'ccc'};
     for my $ccc (keys %{$va->{'ccc'}}) {
         die "Error: Can't overwrite ccc $ccc"
             if (exists $h->{'ccc'}->{$ccc});
+        $h->{'lccc'}->{$ccc} =
+        $h->{'tccc'}->{$ccc} =
         $h->{'ccc'}->{$ccc} = $va->{'ccc'}->{$ccc};
     }
     delete $va->{'ccc'};
@@ -766,14 +799,42 @@ sub read_PropertyAliases {
         s/\#.*//;
         next unless (/\S/);
 
-        if (/^\s*(.+?)\s*;\s*(.+?)\s*$/i) {
-            die "Error: Duplicate property $1 in $filename"
-                if (exists $hash->{$2});
-            $hash->{$2} = $1;
-            $fam->{$2} = $family;
-        }
+        if (/^\s*(.+?)\s*;/) {
+            my $short = $1;
+            my @fields = /;\s*([^\s;]+)/g;
+            if (@fields < 1 || @fields > 2) {
+                my $number = @fields;
+                die "Error: Wrong number of fields ($number) in $filename at $_";
+            }
 
-        else {
+            # Make "n/a" strings unique
+            if ($short eq 'n/a') {
+                $short .= sprintf("%03d", $propNA++);
+            }
+            my $long = $fields[0];
+            if ($long eq 'n/a') {
+                $long .= sprintf("%03d", $propNA++);
+            }
+
+            # Add the long name->short name mapping to $hash (the caller's $pa table)
+            if (exists $hash->{$long}) {
+                die "Error: Duplicate property $long in $filename"
+            }
+            $hash->{$long} = $short;
+            $fam->{$long} = $family;
+
+            # Add the list of further aliases to the additional_property_aliases hash table,
+            # using the long property name as the key.
+            # For example:
+            #   White_Space->space|outer_space
+            if (@fields > 1) {
+                my $value = pop @fields;
+                while (@fields > 1) {
+                    $value .= "|" . pop @fields;
+                }
+                $additional_property_aliases{$long} = $value;
+            }
+        } else {
             die "Error: Can't parse $_ in $filename";
         }
     }
@@ -795,15 +856,13 @@ sub read_PropertyAliases {
 # @return a hash reference.
 sub read_PropertyValueAliases {
 
-    my $filename = shift; 
+    my $hash = shift;         # result
 
-    my $hash = {};         # result
+    my $filename = shift; 
 
     my $in = new FileHandle($filename, 'r');
     die "Error: Cannot open $filename" if (!defined $in);
 
-    my $sym = 0; # Used to make "n/a" strings unique
-
     while (<$in>) {
 
         # Read version (embedded in a comment)
@@ -820,10 +879,10 @@ sub read_PropertyValueAliases {
         if (/^\s*(.+?)\s*;/i) {
             my $prop = $1;
             my @fields = /;\s*([^\s;]+)/g;
-            die "Error: Wrong number of fields"
+            die "Error: Wrong number of fields in $filename"
                 if (@fields < 2 || @fields > 3);
             # Make "n/a" strings unique
-            $fields[0] .= sprintf("%03d", $sym++) if ($fields[0] eq 'n/a');
+            $fields[0] .= sprintf("%03d", $valueNA++) if ($fields[0] eq 'n/a');
             # Squash extra fields together
             while (@fields > 2) {
                 my $f = pop @fields;
@@ -839,18 +898,21 @@ sub read_PropertyValueAliases {
 
     $in->close();
 
-    # Script Qaac (Coptic) is a special case.  Handle it here.  See UTR#24:
-    # http://www.unicode.org/unicode/reports/tr24/
+    # Script Copt=Qaac (Coptic) is a special case.
+    # Before the Copt code was defined, the private-use code Qaac was used.
+    # Starting with Unicode 4.1, PropertyValueAliases.txt contains
+    # Copt as the short name as well as Qaac as an alias.
+    # For use with older Unicode data files, we add here a Qaac->Coptic entry.
+    # This should not do anything for 4.1-and-later Unicode data files.
+    # See also UAX #24: Script Names http://www.unicode.org/unicode/reports/tr24/
     $hash->{'sc'}->{'Qaac'} = 'Coptic'
-        unless (exists $hash->{'sc'}->{'Qaac'});
+        unless (exists $hash->{'sc'}->{'Qaac'} || exists $hash->{'sc'}->{'Copt'});
 
     # Add T|True and F|False -- these are values we recognize for
     # binary properties (NOT from PropertyValueAliases.txt).  These
     # are of the same form as the 'ccc' value aliases.
     $hash->{'binprop'}->{'0'} = 'F|False';
     $hash->{'binprop'}->{'1'} = 'T|True';
-
-    $hash;
 }
 
 #----------------------------------------------------------------------
@@ -897,7 +959,7 @@ sub read_Blocks {
         s/\#.*//;
         next unless (/\S/);
 
-        if (/^([0-9a-f]+)\.\.[0-9a-f]+;\s*(.+?)\s*$/i) {
+        if (/^([0-9a-f]+)\.\.[0-9a-f]+\s*;\s*(.+?)\s*$/i) {
             die "Error: Duplicate range $1 in $filename"
                 if (exists $hash->{$1});
             $hash->{$1} = $2;
@@ -1130,6 +1192,24 @@ sub read_uchar {
             }
         }
 
+        elsif ($mode eq 'UGraphemeClusterBreak') {
+            if (m|^\s*(U_GCB_\w+).+?/\*\[(.+?)\]\*/|) {
+                addDatum($hash, 'GCB', $1, $2);
+            }
+        }
+
+        elsif ($mode eq 'UWordBreakValues') {
+            if (m|^\s*(U_WB_\w+).+?/\*\[(.+?)\]\*/|) {
+                addDatum($hash, 'WB', $1, $2);
+            }
+        }
+
+        elsif ($mode eq 'USentenceBreak') {
+            if (m|^\s*(U_SB_\w+).+?/\*\[(.+?)\]\*/|) {
+                addDatum($hash, 'SB', $1, $2);
+            }
+        }
+
         elsif ($mode eq 'ULineBreak') {
             if (m|^\s*(U_LB_\w+).+?/\*\[(.+?)\]\*/|) {
                 addDatum($hash, 'lb', $1, $2);
@@ -1196,6 +1276,25 @@ sub read_uchar {
 
     $in->close();
 
+    # hardcode known values for the normalization quick check properties
+    # see unorm.h for the UNormalizationCheckResult enum
+
+    addDatum($hash, 'NFC_QC', 'UNORM_NO',    'N');
+    addDatum($hash, 'NFC_QC', 'UNORM_YES',   'Y');
+    addDatum($hash, 'NFC_QC', 'UNORM_MAYBE', 'M');
+
+    addDatum($hash, 'NFKC_QC', 'UNORM_NO',    'N');
+    addDatum($hash, 'NFKC_QC', 'UNORM_YES',   'Y');
+    addDatum($hash, 'NFKC_QC', 'UNORM_MAYBE', 'M');
+
+    # no "maybe" values for NF[K]D
+
+    addDatum($hash, 'NFD_QC', 'UNORM_NO',    'N');
+    addDatum($hash, 'NFD_QC', 'UNORM_YES',   'Y');
+
+    addDatum($hash, 'NFKD_QC', 'UNORM_NO',    'N');
+    addDatum($hash, 'NFKD_QC', 'UNORM_YES',   'Y');
+
     $hash;
 }