X-Git-Url: https://git.saurik.com/apple/icu.git/blobdiff_plain/b75a7d8f3b4adbae880cab104ce2c6a50eee4db2..d5d484b0fbe924d3663b177965538d517ee412c1:/icuSources/tools/genpname/preparse.pl?ds=inline diff --git a/icuSources/tools/genpname/preparse.pl b/icuSources/tools/genpname/preparse.pl index e7532ae3..3e6c9704 100644 --- a/icuSources/tools/genpname/preparse.pl +++ b/icuSources/tools/genpname/preparse.pl @@ -1,7 +1,7 @@ #!/bin/perl -w #******************************************************************* # COPYRIGHT: -# Copyright (c) 2002-2003, International Business Machines Corporation and +# Copyright (c) 2002-2006, International Business Machines Corporation and # others. All Rights Reserved. #******************************************************************* @@ -53,7 +53,7 @@ use FileHandle; use strict; use Dumpvalue; -my $DEBUG = 0; +my $DEBUG = 1; my $DUMPER = new Dumpvalue; my $count = @ARGV; @@ -65,6 +65,10 @@ my $UNIDATA_DIR = "$ICU_DIR/source/data/unidata"; # Get the current year from the system my $YEAR = 1900+@{[localtime]}[5]; # Get the current year +# Used to make "n/a" property [value] aliases (Unicode or Synthetic) unique +my $propNA = 0; +my $valueNA = 0; + #---------------------------------------------------------------------- # Top level property keys for binary, enumerated, string, and double props my @TOP = qw( _bp _ep _sp _dp _mp ); @@ -94,10 +98,6 @@ my %UNSUPPORTED = (Composition_Exclusion => 1, Expands_On_NFKD => 1, FC_NFKC_Closure => 1, ID_Start_Exceptions => 1, - NFC_Quick_Check => 1, - NFD_Quick_Check => 1, - NFKC_Quick_Check => 1, - NFKD_Quick_Check => 1, Special_Case_Condition => 1, ); @@ -106,6 +106,16 @@ my %UNSUPPORTED = (Composition_Exclusion => 1, # missing. my %MISSING_FROM_UCHAR; +# Additional property aliases beyond short and long names, +# like space in addition to WSpace and White_Space in Unicode 4.1. +# Hashtable, maps long name to alias. +# For example, maps White_Space->space. +# +# If multiple additional aliases are defined, +# then they are separated in the value string with '|'. +# For example, White_Space->space|outer_space +my %additional_property_aliases; + #---------------------------------------------------------------------- # Emitted class names @@ -298,7 +308,7 @@ END $i = $groupToInt{$groupString}; } else { my @names = split(/\|/, $groupString); - die "Error: Wrong number of names in " . $groupString if (@names != 2); + die "Error: Wrong number of names in " . $groupString if (@names < 1); $i = @nameGroups; # index of group we are making $groupToInt{$groupString} = $i; # Cache for reuse push @nameGroups, map { $stringToID{$_} } @names; @@ -314,6 +324,7 @@ END print "int32_t NAME_GROUP[] = {\n"; # emit one group per line, with annotations + my $max_names = 0; for (my $i=0; $i<@nameGroups; ) { my @a; my $line; @@ -329,12 +340,14 @@ END ' 'x(20-length($line)), "/* ", sprintf("%3d", $start), ": \"", join("\", \"", map { $strings[$_] } @a), "\" */\n"; + $max_names = @a if(@a > $max_names); + } print "};\n\n"; # This is fixed for 3.2 at "2" but should be calculated dynamically # when more than 2 names appear in Property[Value]Aliases.txt. - print "#define MAX_NAMES_PER_GROUP 2\n\n"; + print "#define MAX_NAMES_PER_GROUP $max_names\n\n"; #------------------------------------------------------------ # Emit enumerated property values @@ -416,7 +429,9 @@ sub readAndMerge { my $pa = {}; read_PropertyAliases($pa, "$unidataDir/PropertyAliases.txt"); read_PropertyAliases($pa, "SyntheticPropertyAliases.txt"); - my $va = read_PropertyValueAliases("$unidataDir/PropertyValueAliases.txt"); + my $va = {}; + read_PropertyValueAliases($va, "$unidataDir/PropertyValueAliases.txt"); + read_PropertyValueAliases($va, "SyntheticPropertyValueAliases.txt"); # Extract property family hash my $fam = $pa->{'_family'}; @@ -575,14 +590,29 @@ sub merge_PropertyAliases { for my $subh (map { $h->{$_} } @TOP) { for my $enum (keys %$subh) { - my $name = $subh->{$enum}; - die "Error: Property $name not found (or used more than once)" - unless (exists $pa->{$name}); + my $long_name = $subh->{$enum}; + if (!exists $pa->{$long_name}) { + die "Error: Property $long_name not found (or used more than once)"; + } - $subh->{$enum} = $pa->{$name} . "|" . $name; - delete $pa->{$name}; + my $value; + if($pa->{$long_name} =~ m|^n/a\d*$|) { + # replace an "n/a" short name with an empty name (nothing before "|"); + # don't remove it (don't remove the "|"): there must always be a long name, + # and if the short name is removed, then the long name becomes the + # short name and there is no long name left (unless there is another alias) + $value = "|" . $long_name; + } else { + $value = $pa->{$long_name} . "|" . $long_name; + } + if (exists $additional_property_aliases{$long_name}) { + $value .= "|" . $additional_property_aliases{$long_name}; + } + $subh->{$enum} = $value; + delete $pa->{$long_name}; } } + my @err; for my $name (keys %$pa) { $MISSING_FROM_UCHAR{$pa->{$name}} = 1; @@ -640,7 +670,7 @@ sub merge_PropertyValueAliases { # look up both long and short & ignore case my $n; if (exists $pva->{$name}) { - $n = $name; + $n = $name; } else { # iterate (slow) for my $a (keys %$pva) { @@ -653,7 +683,7 @@ sub merge_PropertyValueAliases { } } } - + # For blocks, do a loose match from Blocks.txt pseudo-name # to PropertyValueAliases long name. if (!$n && $prop eq 'blk') { @@ -673,9 +703,9 @@ sub merge_PropertyValueAliases { my $l = $n; my $r = $pva->{$n}; - # convert |n/a\d+| to blank - $l = '' if ($l =~ m|^n/a\d+$|); - $r = '' if ($r =~ m|^n/a\d+$|); + # convert |n/a\d*| to blank + $l = '' if ($l =~ m|^n/a\d*$|); + $r = '' if ($r =~ m|^n/a\d*$|); $hh->{$enum} = "$l|$r"; # Don't delete the 'gc' properties because we need to share @@ -689,11 +719,14 @@ sub merge_PropertyValueAliases { } # Merge the combining class values in manually + # Add the same values to the synthetic lccc and tccc properties die "Error: No ccc data" unless exists $va->{'ccc'}; for my $ccc (keys %{$va->{'ccc'}}) { die "Error: Can't overwrite ccc $ccc" if (exists $h->{'ccc'}->{$ccc}); + $h->{'lccc'}->{$ccc} = + $h->{'tccc'}->{$ccc} = $h->{'ccc'}->{$ccc} = $va->{'ccc'}->{$ccc}; } delete $va->{'ccc'}; @@ -766,14 +799,42 @@ sub read_PropertyAliases { s/\#.*//; next unless (/\S/); - if (/^\s*(.+?)\s*;\s*(.+?)\s*$/i) { - die "Error: Duplicate property $1 in $filename" - if (exists $hash->{$2}); - $hash->{$2} = $1; - $fam->{$2} = $family; - } + if (/^\s*(.+?)\s*;/) { + my $short = $1; + my @fields = /;\s*([^\s;]+)/g; + if (@fields < 1 || @fields > 2) { + my $number = @fields; + die "Error: Wrong number of fields ($number) in $filename at $_"; + } - else { + # Make "n/a" strings unique + if ($short eq 'n/a') { + $short .= sprintf("%03d", $propNA++); + } + my $long = $fields[0]; + if ($long eq 'n/a') { + $long .= sprintf("%03d", $propNA++); + } + + # Add long name->short name to the hash=pa hash table + if (exists $hash->{$long}) { + die "Error: Duplicate property $long in $filename" + } + $hash->{$long} = $short; + $fam->{$long} = $family; + + # Add the list of further aliases to the additional_property_aliases hash table, + # using the long property name as the key. + # For example: + # White_Space->space|outer_space + if (@fields > 1) { + my $value = pop @fields; + while (@fields > 1) { + $value .= "|" . pop @fields; + } + $additional_property_aliases{$long} = $value; + } + } else { die "Error: Can't parse $_ in $filename"; } } @@ -795,15 +856,13 @@ sub read_PropertyAliases { # @return a hash reference. sub read_PropertyValueAliases { - my $filename = shift; + my $hash = shift; # result - my $hash = {}; # result + my $filename = shift; my $in = new FileHandle($filename, 'r'); die "Error: Cannot open $filename" if (!defined $in); - my $sym = 0; # Used to make "n/a" strings unique - while (<$in>) { # Read version (embedded in a comment) @@ -820,10 +879,10 @@ sub read_PropertyValueAliases { if (/^\s*(.+?)\s*;/i) { my $prop = $1; my @fields = /;\s*([^\s;]+)/g; - die "Error: Wrong number of fields" + die "Error: Wrong number of fields in $filename" if (@fields < 2 || @fields > 3); # Make "n/a" strings unique - $fields[0] .= sprintf("%03d", $sym++) if ($fields[0] eq 'n/a'); + $fields[0] .= sprintf("%03d", $valueNA++) if ($fields[0] eq 'n/a'); # Squash extra fields together while (@fields > 2) { my $f = pop @fields; @@ -839,18 +898,21 @@ sub read_PropertyValueAliases { $in->close(); - # Script Qaac (Coptic) is a special case. Handle it here. See UTR#24: - # http://www.unicode.org/unicode/reports/tr24/ + # Script Copt=Qaac (Coptic) is a special case. + # Before the Copt code was defined, the private-use code Qaac was used. + # Starting with Unicode 4.1, PropertyValueAliases.txt contains + # Copt as the short name as well as Qaac as an alias. + # For use with older Unicode data files, we add here a Qaac->Coptic entry. + # This should not do anything for 4.1-and-later Unicode data files. + # See also UAX #24: Script Names http://www.unicode.org/unicode/reports/tr24/ $hash->{'sc'}->{'Qaac'} = 'Coptic' - unless (exists $hash->{'sc'}->{'Qaac'}); + unless (exists $hash->{'sc'}->{'Qaac'} || exists $hash->{'sc'}->{'Copt'}); # Add T|True and F|False -- these are values we recognize for # binary properties (NOT from PropertyValueAliases.txt). These # are of the same form as the 'ccc' value aliases. $hash->{'binprop'}->{'0'} = 'F|False'; $hash->{'binprop'}->{'1'} = 'T|True'; - - $hash; } #---------------------------------------------------------------------- @@ -897,7 +959,7 @@ sub read_Blocks { s/\#.*//; next unless (/\S/); - if (/^([0-9a-f]+)\.\.[0-9a-f]+;\s*(.+?)\s*$/i) { + if (/^([0-9a-f]+)\.\.[0-9a-f]+\s*;\s*(.+?)\s*$/i) { die "Error: Duplicate range $1 in $filename" if (exists $hash->{$1}); $hash->{$1} = $2; @@ -1130,6 +1192,24 @@ sub read_uchar { } } + elsif ($mode eq 'UGraphemeClusterBreak') { + if (m|^\s*(U_GCB_\w+).+?/\*\[(.+?)\]\*/|) { + addDatum($hash, 'GCB', $1, $2); + } + } + + elsif ($mode eq 'UWordBreakValues') { + if (m|^\s*(U_WB_\w+).+?/\*\[(.+?)\]\*/|) { + addDatum($hash, 'WB', $1, $2); + } + } + + elsif ($mode eq 'USentenceBreak') { + if (m|^\s*(U_SB_\w+).+?/\*\[(.+?)\]\*/|) { + addDatum($hash, 'SB', $1, $2); + } + } + elsif ($mode eq 'ULineBreak') { if (m|^\s*(U_LB_\w+).+?/\*\[(.+?)\]\*/|) { addDatum($hash, 'lb', $1, $2); @@ -1196,6 +1276,25 @@ sub read_uchar { $in->close(); + # hardcode known values for the normalization quick check properties + # see unorm.h for the UNormalizationCheckResult enum + + addDatum($hash, 'NFC_QC', 'UNORM_NO', 'N'); + addDatum($hash, 'NFC_QC', 'UNORM_YES', 'Y'); + addDatum($hash, 'NFC_QC', 'UNORM_MAYBE', 'M'); + + addDatum($hash, 'NFKC_QC', 'UNORM_NO', 'N'); + addDatum($hash, 'NFKC_QC', 'UNORM_YES', 'Y'); + addDatum($hash, 'NFKC_QC', 'UNORM_MAYBE', 'M'); + + # no "maybe" values for NF[K]D + + addDatum($hash, 'NFD_QC', 'UNORM_NO', 'N'); + addDatum($hash, 'NFD_QC', 'UNORM_YES', 'Y'); + + addDatum($hash, 'NFKD_QC', 'UNORM_NO', 'N'); + addDatum($hash, 'NFKD_QC', 'UNORM_YES', 'Y'); + $hash; }