icuSources/tools/genpname/preparse.pl

   1 #!/bin/perl -w
   2 #*******************************************************************
   3 # COPYRIGHT:
   4 # Copyright (c) 2002-2004, International Business Machines Corporation and
   5 # others. All Rights Reserved.
   6 #*******************************************************************
   7
   8 # This script reads in UCD files PropertyAliases.txt and
   9 # PropertyValueAliases.txt and correlates them with ICU enums
  10 # defined in uchar.h and uscript.h.  It then outputs a header
  11 # file which contains all names and enums.  The header is included
  12 # by the genpname tool C++ source file, which produces the actual
  13 # binary data file.
  14 #
  15 # See usage note below.
  16 #
  17 # TODO: The Property[Value]Alias.txt files state that they can support
  18 # more than 2 names per property|value.  Currently (Unicode 3.2) there
  19 # are always 1 or 2 names.  If more names were supported, presumably
  20 # the format would be something like:
  21 #    nv        ; Numeric_Value
  22 #    nv        ; Value_Numerique
  23 # CURRENTLY, this script assumes that there are 1 or two names.  Any
  24 # duplicates it sees are flagged as an error.  If multiple aliases
  25 # appear in a future version of Unicode, modify this script to support
  26 # that.
  27 #
  28 # NOTE: As of ICU 2.6, this script has been modified to know about the
  29 # pseudo-property gcm/General_Category_Mask, which corresponds to the
  30 # uchar.h property UCHAR_GENERAL_CATEGORY_MASK.  This property
  31 # corresponds to General_Category but is a bitmask value.  It does not
  32 # exist in the UCD.  Therefore, I special case it in several places
  33 # (search for General_Category_Mask and gcm).
  34 #
  35 # NOTE: As of ICU 2.6, this script reads an auxiliary data file,
  36 # SyntheticPropertyAliases.txt, containing property aliases not
  37 # present in the UCD but present in ICU.  This file resides in the
  38 # same directory as this script.  Its contents are merged into those
  39 # of PropertyAliases.txt as if the two files were appended.
  40 #
  41 # NOTE: The following names are handled specially.  See script below
  42 # for details.
  43 #
  44 #   T/True
  45 #   F/False
  46 #   No_Block
  47 #
  48 # Author: Alan Liu
  49 # Created: October 14 2002
  50 # Since: ICU 2.4
  51
  52 use FileHandle;
  53 use strict;
  54 use Dumpvalue;
  55
  56 my $DEBUG = 1;
  57 my $DUMPER = new Dumpvalue;
  58
  59 my $count = @ARGV;
  60 my $ICU_DIR = shift() || '';
  61 my $OUT_FILE = shift() || 'data.h';
  62 my $HEADER_DIR = "$ICU_DIR/source/common/unicode";
  63 my $UNIDATA_DIR = "$ICU_DIR/source/data/unidata";
  64
  65 # Get the current year from the system
  66 my $YEAR = 1900+@{[localtime]}[5]; # Get the current year
  67
  68 #----------------------------------------------------------------------
  69 # Top level property keys for binary, enumerated, string, and double props
  70 my @TOP     = qw( _bp _ep _sp _dp _mp );
  71
  72 # This hash governs how top level properties are grouped into output arrays.
  73 #my %TOP_PROPS = ( "VALUED"   => [ '_bp', '_ep' ],
  74 #                  "NO_VALUE" => [ '_sp', '_dp' ] );m
  75 #my %TOP_PROPS = ( "BINARY"   => [ '_bp' ],
  76 #                  "ENUMERATED" => [ '_ep' ],
  77 #                  "STRING" => [ '_sp' ],
  78 #                  "DOUBLE" => [ '_dp' ] );
  79 my %TOP_PROPS = ( ""   => [ '_bp', '_ep', '_sp', '_dp', '_mp' ] );
  80
  81 my %PROP_TYPE = (Binary => "_bp",
  82                  String => "_sp",
  83                  Double => "_dp",
  84                  Enumerated => "_ep",
  85                  Bitmask => "_mp");
  86 #----------------------------------------------------------------------
  87
  88 # Properties that are unsupported in ICU
  89 my %UNSUPPORTED = (Composition_Exclusion => 1,
  90                    Decomposition_Mapping => 1,
  91                    Expands_On_NFC => 1,
  92                    Expands_On_NFD => 1,
  93                    Expands_On_NFKC => 1,
  94                    Expands_On_NFKD => 1,
  95                    FC_NFKC_Closure => 1,
  96                    ID_Start_Exceptions => 1,
  97                    Special_Case_Condition => 1,
  98                    );
  99
 100 # Short names of properties that weren't seen in uchar.h.  If the
 101 # properties weren't seen, don't complain about the property values
 102 # missing.
 103 my %MISSING_FROM_UCHAR;
 104
 105 #----------------------------------------------------------------------
 106
 107 # Emitted class names
 108 my ($STRING_CLASS, $ALIAS_CLASS, $PROPERTY_CLASS) = qw(AliasName Alias Property);
 109
 110 if ($count < 1 || $count > 2 ||
 111     !-d $HEADER_DIR ||
 112     !-d $UNIDATA_DIR) {
 113     my $me = $0;
 114     $me =~ s|.+[/\\]||;
 115     my $lm = ' ' x length($me);
 116     print <<"END";
 117
 118 $me: Reads ICU4C headers and Unicode data files and creates
 119 $lm  a C header file that is included by genpname.  The header
 120 $lm  file matches constants defined in the ICU4C headers with
 121 $lm  property|value aliases in the Unicode data files.
 122
 123 Usage: $me <icu_dir> [<out_file>]
 124
 125 <icu_dir>   ICU4C root directory, containing
 126                source/common/unicode/uchar.h
 127                source/common/unicode/uscript.h
 128                source/data/unidata/Blocks.txt
 129                source/data/unidata/PropertyAliases.txt
 130                source/data/unidata/PropertyValueAliases.txt
 131 <out_file>  File name of header to be written;
 132             default is 'data.h'.
 133
 134 The Unicode versions of all input files must match.
 135 END
 136     exit(1);
 137 }
 138
 139 my ($h, $version) = readAndMerge($HEADER_DIR, $UNIDATA_DIR);
 140
 141 if ($DEBUG) {
 142     print "Merged hash:\n";
 143     for my $key (sort keys %$h) {
 144         my $hh = $h->{$key};
 145         for my $subkey (sort keys %$hh) {
 146             print "$key:$subkey:", $hh->{$subkey}, "\n";
 147         }
 148     }
 149 }
 150
 151 my $out = new FileHandle($OUT_FILE, 'w');
 152 die "Error: Can't write to $OUT_FILE: $!" unless (defined $out);
 153 my $save = select($out);
 154 formatData($h, $version);
 155 select($save);
 156 $out->close();
 157
 158 exit(0);
 159
 160 #----------------------------------------------------------------------
 161 # From PropList.html: "The properties of the form Other_XXX
 162 # are used to generate properties in DerivedCoreProperties.txt.
 163 # They are not intended for general use, such as in APIs that
 164 # return property values.
 165 # Non_Break is not a valid property as of 3.2.
 166 sub isIgnoredProperty {
 167     local $_ = shift;
 168     /^Other_/i || /^Non_Break$/i;
 169 }
 170
 171 # 'qc' is a pseudo-property matching any quick-check property
 172 # see PropertyValueAliases.txt file comments.  'binprop' is
 173 # a synthetic binary value alias "True"/"False", not present
 174 # in PropertyValueAliases.txt.
 175 sub isPseudoProperty {
 176     $_[0] eq 'qc' ||
 177         $_[0] eq 'binprop';
 178 }
 179
 180 #----------------------------------------------------------------------
 181 # Emit the combined data from headers and the Unicode database as a
 182 # C source code header file.
 183 #
 184 # @param ref to hash with the data
 185 # @param Unicode version, as a string
 186 sub formatData {
 187     my $h = shift;
 188     my $version = shift;
 189
 190     my $date = scalar localtime();
 191     print <<"END";
 192 /**
 193  * Copyright (C) 2002-$YEAR, International Business Machines Corporation and
 194  * others. All Rights Reserved.
 195  *
 196  * MACHINE GENERATED FILE.  !!! Do not edit manually !!!
 197  *
 198  * Generated from
 199  *   uchar.h
 200  *   uscript.h
 201  *   Blocks.txt
 202  *   PropertyAliases.txt
 203  *   PropertyValueAliases.txt
 204  *
 205  * Date: $date
 206  * Unicode version: $version
 207  * Script: $0
 208  */
 209
 210 END
 211
 212     #------------------------------------------------------------
 213     # Emit Unicode version
 214     print "/* Unicode version $version */\n";
 215     my @v = split(/\./, $version);
 216     push @v, '0' while (@v < 4);
 217     for (my $i=0; $i<@v; ++$i) {
 218         print "const uint8_t VERSION_$i = $v[$i];\n";
 219     }
 220     print "\n";
 221
 222     #------------------------------------------------------------
 223     # Emit String table
 224     # [A table of all identifiers, that is, all long or short property
 225     # or value names.  The list need NOT be sorted; it will be sorted
 226     # by the C program.  Strings are referenced by their index into
 227     # this table.  After sorting, a REMAP[] array is used to map the
 228     # old position indices to the new positions.]
 229     my %strings;
 230     for my $prop (sort keys %$h) {
 231         my $hh = $h->{$prop};
 232         for my $enum (sort keys %$hh) {
 233             my @a = split(/\|/, $hh->{$enum});
 234             for (@a) {
 235                 $strings{$_} = 1 if (length($_));
 236             }
 237         }
 238     }
 239     my @strings = sort keys %strings;
 240     unshift @strings, "";
 241
 242     print "const int32_t STRING_COUNT = ", scalar @strings, ";\n\n";
 243
 244     # while printing, create a mapping hash from string table entry to index
 245     my %stringToID;
 246     print "/* to be sorted */\n";
 247     print "const $STRING_CLASS STRING_TABLE[] = {\n";
 248     for (my $i=0; $i<@strings; ++$i) {
 249         print "    $STRING_CLASS(\"$strings[$i]\", $i),\n";
 250         $stringToID{$strings[$i]} = $i;
 251     }
 252     print "};\n\n";
 253
 254     # placeholder for the remapping index.  this is used to map
 255     # indices that we compute here to indices of the sorted
 256     # STRING_TABLE.  STRING_TABLE will be sorted by the C++ program
 257     # using the uprv_comparePropertyNames() function.  this will
 258     # reshuffle the order.  we then use the indices (passed to the
 259     # String constructor) to create a REMAP[] array.
 260     print "/* to be filled in */\n";
 261     print "int32_t REMAP[", scalar @strings, "];\n\n";
 262
 263     #------------------------------------------------------------
 264     # Emit the name group table
 265     # [A table of name groups.  A name group is one or more names
 266     # for a property or property value.  The Unicode data files specify
 267     # that there may be more than 2, although as of Unicode 3.2 there
 268     # are at most 2.  The name group table looks like this:
 269     #
 270     #  114, -115, 116, -117, 0, -118, 65, -64, ...
 271     #  [0]        [2]        [4]      [6]
 272     #
 273     # The entry at [0] consists of 2 strings, 114 and 115.
 274     # The entry at [2] consists of 116 and 117.  The entry at
 275     # [4] is one string, 118.  There is always at least one
 276     # string; typically there are two.  If there are two, the first
 277     # is the SHORT name and the second is the LONG.  If there is
 278     # one, then the missing entry (always the short name, in 3.2)
 279     # is zero, which is by definition the index of "".  The
 280     # 'preferred' name will generally be the LONG name, if there are
 281     # more than 2 entries.  The last entry is negative.
 282
 283     # Build name group list and replace string refs with nameGroup indices
 284     my @nameGroups;
 285
 286     # Check for duplicate name groups, and reuse them if possible
 287     my %groupToInt; # Map group strings to ints
 288     for my $prop (sort keys %$h) {
 289         my $hh = $h->{$prop};
 290         for my $enum (sort keys %$hh) {
 291             my $groupString = $hh->{$enum};
 292             my $i;
 293             if (exists $groupToInt{$groupString}) {
 294                 $i = $groupToInt{$groupString};
 295             } else {
 296                 my @names = split(/\|/, $groupString);
 297                 die "Error: Wrong number of names in " . $groupString if (@names < 2);
 298                 $i = @nameGroups; # index of group we are making
 299                 $groupToInt{$groupString} = $i; # Cache for reuse
 300                 push @nameGroups, map { $stringToID{$_} } @names;
 301                 $nameGroups[$#nameGroups] = -$nameGroups[$#nameGroups]; # mark end
 302             }
 303             # now, replace string list with ref to name group
 304             $hh->{$enum} = $i;
 305         }
 306     }
 307
 308     print "const int32_t NAME_GROUP_COUNT = ",
 309           scalar @nameGroups, ";\n\n";
 310
 311     print "int32_t NAME_GROUP[] = {\n";
 312     # emit one group per line, with annotations
 313     my $max_names = 0;
 314     for (my $i=0; $i<@nameGroups; ) {
 315         my @a;
 316         my $line;
 317         my $start = $i;
 318         for (;;) {
 319             my $j = $nameGroups[$i++];
 320             $line .= "$j, ";
 321             push @a, abs($j);
 322             last if ($j < 0);
 323         }
 324         print "    ",
 325               $line,
 326               ' 'x(20-length($line)),
 327               "/* ", sprintf("%3d", $start),
 328               ": \"", join("\", \"", map { $strings[$_] } @a), "\" */\n";
 329         $max_names = @a if(@a > $max_names);
 330
 331     }
 332     print "};\n\n";
 333
 334     # This is fixed for 3.2 at "2" but should be calculated dynamically
 335     # when more than 2 names appear in Property[Value]Aliases.txt.
 336     print "#define MAX_NAMES_PER_GROUP $max_names\n\n";
 337
 338     #------------------------------------------------------------
 339     # Emit enumerated property values
 340     for my $prop (sort keys %$h) {
 341         next if ($prop =~ /^_/);
 342         my $vh = $h->{$prop};
 343         my $count = scalar keys %$vh;
 344
 345         print "const int32_t VALUES_${prop}_COUNT = ",
 346               $count, ";\n\n";
 347
 348         print "const $ALIAS_CLASS VALUES_${prop}\[] = {\n";
 349         for my $enum (sort keys %$vh) {
 350             #my @names = split(/\|/, $vh->{$enum});
 351             #die "Error: Wrong number of names for $prop:$enum in [" . join(",", @names) . "]"
 352             #    if (@names != 2);
 353             print "    $ALIAS_CLASS((int32_t) $enum, ", $vh->{$enum}, "),\n";
 354                   #$stringToID{$names[0]}, ", ",
 355                   #$stringToID{$names[1]}, "),\n";
 356             #      "\"", $names[0], "\", ",
 357             #      "\"", $names[1], "\"),\n";
 358         }
 359         print "};\n\n";
 360     }
 361
 362     #------------------------------------------------------------
 363     # Emit top-level properties (binary, enumerated, etc.)
 364     for my $topName (sort keys %TOP_PROPS) {
 365         my $a = $TOP_PROPS{$topName};
 366         my $count = 0;
 367         for my $type (@$a) { # "_bp", "_ep", etc.
 368             $count += scalar keys %{$h->{$type}};
 369         }
 370
 371         print "const int32_t ${topName}PROPERTY_COUNT = $count;\n\n";
 372
 373         print "const $PROPERTY_CLASS ${topName}PROPERTY[] = {\n";
 374
 375         for my $type (@$a) { # "_bp", "_ep", etc.
 376             my $p = $h->{$type};
 377
 378             for my $enum (sort keys %$p) {
 379                 my $name = $strings[$nameGroups[$p->{$enum}]];
 380
 381                 my $valueRef = "0, NULL";
 382                 if ($type eq '_bp') {
 383                     $valueRef = "VALUES_binprop_COUNT, VALUES_binprop";
 384                 }
 385                 elsif (exists $h->{$name}) {
 386                     $valueRef = "VALUES_${name}_COUNT, VALUES_$name";
 387                 }
 388
 389                 print "    $PROPERTY_CLASS((int32_t) $enum, ",
 390                       $p->{$enum}, ", $valueRef),\n";
 391             }
 392         }
 393         print "};\n\n";
 394     }
 395
 396     print "/*eof*/\n";
 397 }
 398
 399 #----------------------------------------------------------------------
 400 # Read in the files uchar.h, uscript.h, Blocks.txt,
 401 # PropertyAliases.txt, and PropertyValueAliases.txt,
 402 # and combine them into one hash.
 403 #
 404 # @param directory containing headers
 405 # @param directory containin Unicode data files
 406 #
 407 # @return hash ref, Unicode version
 408 sub readAndMerge {
 409
 410     my ($headerDir, $unidataDir) = @_;
 411
 412     my $h = read_uchar("$headerDir/uchar.h");
 413     my $s = read_uscript("$headerDir/uscript.h");
 414     my $b = read_Blocks("$unidataDir/Blocks.txt");
 415     my $pa = {};
 416     read_PropertyAliases($pa, "$unidataDir/PropertyAliases.txt");
 417     read_PropertyAliases($pa, "SyntheticPropertyAliases.txt");
 418     my $va = read_PropertyValueAliases("$unidataDir/PropertyValueAliases.txt");
 419
 420     # Extract property family hash
 421     my $fam = $pa->{'_family'};
 422     delete $pa->{'_family'};
 423
 424     # Note: uscript.h has no version string, so don't check it
 425     my $version = check_versions([ 'uchar.h', $h ],
 426                                  [ 'Blocks.txt', $b ],
 427                                  [ 'PropertyAliases.txt', $pa ],
 428                                  [ 'PropertyValueAliases.txt', $va ]);
 429
 430     # Do this BEFORE merging; merging modifies the hashes
 431     check_PropertyValueAliases($pa, $va);
 432
 433     # Dump out the $va hash for debugging
 434     if ($DEBUG) {
 435         print "Property values hash:\n";
 436         for my $key (sort keys %$va) {
 437             my $hh = $va->{$key};
 438             for my $subkey (sort keys %$hh) {
 439                 print "$key:$subkey:", $hh->{$subkey}, "\n";
 440             }
 441         }
 442     }
 443
 444     # Dump out the $s hash for debugging
 445     if ($DEBUG) {
 446         print "Script hash:\n";
 447         for my $key (sort keys %$s) {
 448             print "$key:", $s->{$key}, "\n";
 449         }
 450     }
 451
 452     # Link in the script data
 453     $h->{'sc'} = $s;
 454
 455     merge_Blocks($h, $b);
 456
 457     merge_PropertyAliases($h, $pa, $fam);
 458
 459     merge_PropertyValueAliases($h, $va);
 460
 461     ($h, $version);
 462 }
 463
 464 #----------------------------------------------------------------------
 465 # Ensure that the version strings in the given hashes (under the key
 466 # '_version') are compatible.  Currently this means they must be
 467 # identical, with the exception that "X.Y" will match "X.Y.0".
 468 # All hashes must define the key '_version'.
 469 #
 470 # @param a list of pairs of (file name, hash reference)
 471 #
 472 # @return the version of all the hashes.  Upon return, the '_version'
 473 # will be removed from all hashes.
 474 sub check_versions {
 475     my $version = '';
 476     my $msg = '';
 477     foreach my $a (@_) {
 478         my $name = $a->[0];
 479         my $h    = $a->[1];
 480         die "Error: No version found" unless (exists $h->{'_version'});
 481         my $v = $h->{'_version'};
 482         delete $h->{'_version'};
 483
 484         # append ".0" if necessary, to standardize to X.Y.Z
 485         $v .= '.0' unless ($v =~ /\.\d+\./);
 486         $v .= '.0' unless ($v =~ /\.\d+\./);
 487         $msg .= "$name = $v\n";
 488         if ($version) {
 489             die "Error: Mismatched Unicode versions\n$msg"
 490                 unless ($version eq $v);
 491         } else {
 492             $version = $v;
 493         }
 494     }
 495     $version;
 496 }
 497
 498 #----------------------------------------------------------------------
 499 # Make sure the property names in PropertyValueAliases.txt match those
 500 # in PropertyAliases.txt.
 501 #
 502 # @param a hash ref from read_PropertyAliases.
 503 # @param a hash ref from read_PropertyValueAliases.
 504 sub check_PropertyValueAliases {
 505     my ($pa, $va) = @_;
 506
 507     # make a reverse hash of short->long
 508     my %rev;
 509     for (keys %$pa) { $rev{$pa->{$_}} = $_; }
 510
 511     for my $prop (keys %$va) {
 512         if (!exists $rev{$prop} && !isPseudoProperty($prop)) {
 513             print "Warning: Property $prop from PropertyValueAliases not listed in PropertyAliases\n";
 514         }
 515     }
 516 }
 517
 518 #----------------------------------------------------------------------
 519 # Merge blocks data into uchar.h enum data.  In the 'blk' subhash all
 520 # code point values, as returned from read_uchar, are replaced by
 521 # block names, as read from Blocks.txt and returned by read_Blocks.
 522 # The match must be 1-to-1.  If there is any failure of 1-to-1
 523 # mapping, an error is signaled.  Upon return, the read_Blocks hash
 524 # is emptied of all contents, except for those that failed to match.
 525 #
 526 # The mapping in the 'blk' subhash, after this function returns, is
 527 # from uchar.h enum name, e.g. "UBLOCK_BASIC_LATIN", to Blocks.h
 528 # pseudo-name, e.g. "Basic Latin".
 529 #
 530 # @param a hash ref from read_uchar.
 531 # @param a hash ref from read_Blocks.
 532 sub merge_Blocks {
 533     my ($h, $b) = @_;
 534
 535     die "Error: No blocks data in uchar.h"
 536         unless (exists $h->{'blk'});
 537     my $blk = $h->{'blk'};
 538     for my $enum (keys %$blk) {
 539         my $cp = $blk->{$enum};
 540         if ($cp && !exists $b->{$cp}) {
 541             die "Error: No block found at $cp in Blocks.txt";
 542         }
 543         # Convert code point to pseudo-name:
 544         $blk->{$enum} = $b->{$cp};
 545         delete $b->{$cp};
 546     }
 547     my $err = '';
 548     for my $cp (keys %$b) {
 549         $err .= "Error: Block " . $b->{$cp} . " not listed in uchar.h\n";
 550     }
 551     die $err if ($err);
 552 }
 553
 554 #----------------------------------------------------------------------
 555 # Merge property alias names into the uchar.h hash.  The subhashes
 556 # under the keys _* (b(inary, e(numerated, s(tring, d(ouble) are
 557 # examined and the values of those subhashes are assumed to be long
 558 # names in PropertyAliases.txt.  They are validated and replaced by
 559 # "<short>|<long>".  Upon return, the read_PropertyAliases hash is
 560 # emptied of all contents, except for those that failed to match.
 561 # Unmatched names in PropertyAliases are listed as a warning but do
 562 # NOT cause the script to die.
 563 #
 564 # @param a hash ref from read_uchar.
 565 # @param a hash ref from read_PropertyAliases.
 566 # @param a hash mapping long names to property family (e.g., 'binary')
 567 sub merge_PropertyAliases {
 568     my ($h, $pa, $fam) = @_;
 569
 570     for my $k (@TOP) {
 571         die "Error: No properties data for $k in uchar.h"
 572             unless (exists $h->{$k});
 573     }
 574
 575     for my $subh (map { $h->{$_} } @TOP) {
 576         for my $enum (keys %$subh) {
 577             my $name = $subh->{$enum};
 578             die "Error: Property $name not found (or used more than once)"
 579                 unless (exists $pa->{$name});
 580
 581             $subh->{$enum} = $pa->{$name} . "|" . $name;
 582             delete $pa->{$name};
 583         }
 584     }
 585     my @err;
 586     for my $name (keys %$pa) {
 587         $MISSING_FROM_UCHAR{$pa->{$name}} = 1;
 588         if (exists $UNSUPPORTED{$name}) {
 589             push @err, "Info: No enum for " . $fam->{$name} . " property $name in uchar.h";
 590         } elsif (!isIgnoredProperty($name)) {
 591             push @err, "Warning: No enum for " . $fam->{$name} . " property $name in uchar.h";
 592         }
 593     }
 594     print join("\n", sort @err), "\n" if (@err);
 595 }
 596
 597 #----------------------------------------------------------------------
 598 # Return 1 if two names match ignoring whitespace, '-', and '_'.
 599 # Used to match names in Blocks.txt with those in PropertyValueAliases.txt
 600 # as of Unicode 4.0.
 601 sub matchesLoosely {
 602     my ($a, $b) = @_;
 603     $a =~ s/[\s\-_]//g;
 604     $b =~ s/[\s\-_]//g;
 605     $a =~ /^$b$/i;
 606 }
 607
 608 #----------------------------------------------------------------------
 609 # Merge PropertyValueAliases.txt data into the uchar.h hash.  All
 610 # properties other than blk, _bp, and _ep are analyzed and mapped to
 611 # the names listed in PropertyValueAliases.  They are then replaced
 612 # with a string of the form "<short>|<long>".  The short or long name
 613 # may be missing.
 614 #
 615 # @param a hash ref from read_uchar.
 616 # @param a hash ref from read_PropertyValueAliases.
 617 sub merge_PropertyValueAliases {
 618     my ($h, $va) = @_;
 619
 620     my %gcCount;
 621     for my $prop (keys %$h) {
 622         # _bp, _ep handled in merge_PropertyAliases
 623         next if ($prop =~ /^_/);
 624
 625         # Special case: gcm
 626         my $prop2 = ($prop eq 'gcm') ? 'gc' : $prop;
 627
 628         # find corresponding PropertyValueAliases data
 629         die "Error: Can't find $prop in PropertyValueAliases.txt"
 630             unless (exists $va->{$prop2});
 631         my $pva = $va->{$prop2};
 632
 633         # match up data
 634         my $hh = $h->{$prop};
 635         for my $enum (keys %$hh) {
 636
 637             my $name = $hh->{$enum};
 638
 639             # look up both long and short & ignore case
 640             my $n;
 641             if (exists $pva->{$name}) {
 642                 $n = $name;
 643             } else {
 644                 # iterate (slow)
 645                 for my $a (keys %$pva) {
 646                     # case-insensitive match
 647                     # & case-insensitive reverse match
 648                     if ($a =~ /^$name$/i ||
 649                         $pva->{$a} =~ /^$name$/i) {
 650                         $n = $a;
 651                         last;
 652                     }
 653                 }
 654             }
 655
 656             # For blocks, do a loose match from Blocks.txt pseudo-name
 657             # to PropertyValueAliases long name.
 658             if (!$n && $prop eq 'blk') {
 659                 for my $a (keys %$pva) {
 660                     # The block is only going to match the long name,
 661                     # but we check both for completeness.  As of Unicode
 662                     # 4.0, blocks do not have short names.
 663                     if (matchesLoosely($name, $pva->{$a}) ||
 664                         matchesLoosely($name, $a)) {
 665                         $n = $a;
 666                         last;
 667                     }
 668                 }
 669             }
 670
 671             die "Error: Property value $prop:$name not found" unless ($n);
 672
 673             my $l = $n;
 674             my $r = $pva->{$n};
 675             # convert |n/a\d+| to blank
 676             $l = '' if ($l =~ m|^n/a\d+$|);
 677             $r = '' if ($r =~ m|^n/a\d+$|);
 678
 679             $hh->{$enum} = "$l|$r";
 680             # Don't delete the 'gc' properties because we need to share
 681             # them between 'gc' and 'gcm'.  Count each use instead.
 682             if ($prop2 eq 'gc') {
 683                 ++$gcCount{$n};
 684             } else {
 685                 delete $pva->{$n};
 686             }
 687         }
 688     }
 689
 690     # Merge the combining class values in manually
 691     # Add the same values to the synthetic lccc and tccc properties
 692     die "Error: No ccc data"
 693         unless exists $va->{'ccc'};
 694     for my $ccc (keys %{$va->{'ccc'}}) {
 695         die "Error: Can't overwrite ccc $ccc"
 696             if (exists $h->{'ccc'}->{$ccc});
 697         $h->{'lccc'}->{$ccc} =
 698         $h->{'tccc'}->{$ccc} =
 699         $h->{'ccc'}->{$ccc} = $va->{'ccc'}->{$ccc};
 700     }
 701     delete $va->{'ccc'};
 702
 703     # Merge synthetic binary property values in manually.
 704     # These are the "True" and "False" value aliases.
 705     die "Error: No True/False value aliases"
 706         unless exists $va->{'binprop'};
 707     for my $bp (keys %{$va->{'binprop'}}) {
 708         $h->{'binprop'}->{$bp} = $va->{'binprop'}->{$bp};
 709     }
 710     delete $va->{'binprop'};
 711
 712     my $err = '';
 713     for my $prop (sort keys %$va) {
 714         my $hh = $va->{$prop};
 715         for my $subkey (sort keys %$hh) {
 716             # 'gc' props are shared with 'gcm'; make sure they were used
 717             # once or twice.
 718             if ($prop eq 'gc') {
 719                 my $n = $gcCount{$subkey};
 720                 next if ($n >= 1 && $n <= 2);
 721             }
 722             $err .= "Warning: Enum for value $prop:$subkey not found in uchar.h\n"
 723                 unless exists $MISSING_FROM_UCHAR{$prop};
 724         }
 725     }
 726     print $err if ($err);
 727 }
 728
 729 #----------------------------------------------------------------------
 730 # Read the PropertyAliases.txt file.  Return a hash that maps the long
 731 # name to the short name.  The special key '_version' will map to the
 732 # Unicode version of the file.  The special key '_family' holds a
 733 # subhash that maps long names to a family string, for descriptive
 734 # purposes.
 735 #
 736 # @param a filename for PropertyAliases.txt
 737 # @param reference to hash to receive data.  Keys are long names.
 738 # Values are short names.
 739 sub read_PropertyAliases {
 740
 741     my $hash = shift;         # result
 742
 743     my $filename = shift;
 744
 745     my $fam = {};  # map long names to family string
 746     $fam = $hash->{'_family'} if (exists $hash->{'_family'});
 747
 748     my $family; # binary, enumerated, etc.
 749
 750     my $in = new FileHandle($filename, 'r');
 751     die "Error: Cannot open $filename" if (!defined $in);
 752
 753     while (<$in>) {
 754
 755         # Read version (embedded in a comment)
 756         if (/PropertyAliases-(\d+\.\d+\.\d+)/i) {
 757             die "Error: Multiple versions in $filename"
 758                 if (exists $hash->{'_version'});
 759             $hash->{'_version'} = $1;
 760         }
 761
 762         # Read family heading
 763         if (/^\s*\#\s*(.+?)\s*Properties\s*$/) {
 764             $family = $1;
 765         }
 766
 767         # Ignore comments and blank lines
 768         s/\#.*//;
 769         next unless (/\S/);
 770
 771         if (/^\s*(.+?)\s*;\s*(.+?)\s*$/i) {
 772             die "Error: Duplicate property $1 in $filename"
 773                 if (exists $hash->{$2});
 774             $hash->{$2} = $1;
 775             $fam->{$2} = $family;
 776         }
 777
 778         else {
 779             die "Error: Can't parse $_ in $filename";
 780         }
 781     }
 782
 783     $in->close();
 784
 785     $hash->{'_family'} = $fam;
 786 }
 787
 788 #----------------------------------------------------------------------
 789 # Read the PropertyValueAliases.txt file.  Return a two level hash
 790 # that maps property_short_name:value_short_name:value_long_name.  In
 791 # the case of the 'ccc' property, the short name is the numeric class
 792 # and the long name is "<short>|<long>".  The special key '_version'
 793 # will map to the Unicode version of the file.
 794 #
 795 # @param a filename for PropertyValueAliases.txt
 796 #
 797 # @return a hash reference.
 798 sub read_PropertyValueAliases {
 799
 800     my $filename = shift;
 801
 802     my $hash = {};         # result
 803
 804     my $in = new FileHandle($filename, 'r');
 805     die "Error: Cannot open $filename" if (!defined $in);
 806
 807     my $sym = 0; # Used to make "n/a" strings unique
 808
 809     while (<$in>) {
 810
 811         # Read version (embedded in a comment)
 812         if (/PropertyValueAliases-(\d+\.\d+\.\d+)/i) {
 813             die "Error: Multiple versions in $filename"
 814                 if (exists $hash->{'_version'});
 815             $hash->{'_version'} = $1;
 816         }
 817
 818         # Ignore comments and blank lines
 819         s/\#.*//;
 820         next unless (/\S/);
 821
 822         if (/^\s*(.+?)\s*;/i) {
 823             my $prop = $1;
 824             my @fields = /;\s*([^\s;]+)/g;
 825             die "Error: Wrong number of fields"
 826                 if (@fields < 2 || @fields > 3);
 827             # Make "n/a" strings unique
 828             $fields[0] .= sprintf("%03d", $sym++) if ($fields[0] eq 'n/a');
 829             # Squash extra fields together
 830             while (@fields > 2) {
 831                 my $f = pop @fields;
 832                 $fields[$#fields] .= '|' . $f;
 833             }
 834             addDatum($hash, $prop, @fields);
 835         }
 836
 837         else {
 838             die "Error: Can't parse $_ in $filename";
 839         }
 840     }
 841
 842     $in->close();
 843
 844     # Script Qaac (Coptic) is a special case.  Handle it here.  See UTR#24:
 845     # http://www.unicode.org/unicode/reports/tr24/
 846     $hash->{'sc'}->{'Qaac'} = 'Coptic'
 847         unless (exists $hash->{'sc'}->{'Qaac'});
 848
 849     # Add T|True and F|False -- these are values we recognize for
 850     # binary properties (NOT from PropertyValueAliases.txt).  These
 851     # are of the same form as the 'ccc' value aliases.
 852     $hash->{'binprop'}->{'0'} = 'F|False';
 853     $hash->{'binprop'}->{'1'} = 'T|True';
 854
 855     $hash;
 856 }
 857
 858 #----------------------------------------------------------------------
 859 # Read the Blocks.txt file.  Return a hash that maps the code point
 860 # range start to the block name.  The special key '_version' will map
 861 # to the Unicode version of the file.
 862 #
 863 # As of Unicode 4.0, the names in the Blocks.txt are no longer the
 864 # proper names.  The proper names are now listed in PropertyValueAliases.
 865 # They are similar but not identical.  Furthermore, 4.0 introduces
 866 # a new block name, No_Block, which is listed only in PropertyValueAliases
 867 # and not in Blocks.txt.  As a result, we handle blocks as follows:
 868 #
 869 # 1. Read Blocks.txt to map code point range start to quasi-block name.
 870 # 2. Add to Blocks.txt a synthetic No Block code point & name:
 871 #    X -> No Block
 872 # 3. Map quasi-names from Blocks.txt (including No Block) to actual
 873 #    names from PropertyValueAliases.  This occurs in
 874 #    merge_PropertyValueAliases.
 875 #
 876 # @param a filename for Blocks.txt
 877 #
 878 # @return a ref to a hash.  Keys are code points, as text, e.g.,
 879 # "1720".  Values are pseudo-block names, e.g., "Hanunoo".
 880 sub read_Blocks {
 881
 882     my $filename = shift;
 883
 884     my $hash = {};         # result
 885
 886     my $in = new FileHandle($filename, 'r');
 887     die "Error: Cannot open $filename" if (!defined $in);
 888
 889     while (<$in>) {
 890
 891         # Read version (embedded in a comment)
 892         if (/Blocks-(\d+\.\d+\.\d+)/i) {
 893             die "Error: Multiple versions in $filename"
 894                 if (exists $hash->{'_version'});
 895             $hash->{'_version'} = $1;
 896         }
 897
 898         # Ignore comments and blank lines
 899         s/\#.*//;
 900         next unless (/\S/);
 901
 902         if (/^([0-9a-f]+)\.\.[0-9a-f]+;\s*(.+?)\s*$/i) {
 903             die "Error: Duplicate range $1 in $filename"
 904                 if (exists $hash->{$1});
 905             $hash->{$1} = $2;
 906         }
 907
 908         else {
 909             die "Error: Can't parse $_ in $filename";
 910         }
 911     }
 912
 913     $in->close();
 914
 915     # Add pseudo-name for No Block
 916     $hash->{'none'} = 'No Block';
 917
 918     $hash;
 919 }
 920
 921 #----------------------------------------------------------------------
 922 # Read the uscript.h file and compile a mapping of Unicode symbols to
 923 # icu4c enum values.
 924 #
 925 # @param a filename for uscript.h
 926 #
 927 # @return a ref to a hash.  The keys of the hash are enum symbols from
 928 # uscript.h, and the values are script names.
 929 sub read_uscript {
 930
 931     my $filename = shift;
 932
 933     my $mode = '';         # state machine mode and submode
 934     my $submode = '';
 935
 936     my $last = '';         # for line folding
 937
 938     my $hash = {};         # result
 939     my $key;               # first-level key
 940
 941     my $in = new FileHandle($filename, 'r');
 942     die "Error: Cannot open $filename" if (!defined $in);
 943
 944     while (<$in>) {
 945         # Fold continued lines together
 946         if (/^(.*)\\$/) {
 947             $last = $1;
 948             next;
 949         } elsif ($last) {
 950             $_ = $last . $_;
 951             $last = '';
 952         }
 953
 954         # Exit all modes here
 955         if ($mode && $mode ne 'DEPRECATED') {
 956             if (/^\s*\}/) {
 957                 $mode = '';
 958                 next;
 959             }
 960         }
 961
 962         # Handle individual modes
 963
 964         if ($mode eq 'UScriptCode') {
 965             if (m|^\s*(USCRIPT_\w+).+?/\*\s*(\w+)|) {
 966                 my ($enum, $code) = ($1, $2);
 967                 die "Error: Duplicate script $enum"
 968                     if (exists $hash->{$enum});
 969                 $hash->{$enum} = $code;
 970             }
 971         }
 972
 973         elsif ($mode eq 'DEPRECATED') {
 974             if (/\s*\#ifdef/) {
 975                 die "Error: Nested #ifdef";
 976                 }
 977             elsif (/\s*\#endif/) {
 978                 $mode = '';
 979             }
 980         }
 981
 982         elsif (!$mode) {
 983             if (/^\s*typedef\s+enum\s+(\w+)\s*\{/ ||
 984                    /^\s*typedef\s+enum\s+(\w+)\s*$/) {
 985                 $mode = $1;
 986                 #print "Parsing $mode\n";
 987             }
 988
 989             elsif (/^\s*\#ifdef\s+ICU_UCHAR_USE_DEPRECATES\b/) {
 990                 $mode = 'DEPRECATED';
 991             }
 992         }
 993     }
 994
 995     $in->close();
 996
 997     $hash;
 998 }
 999
1000 #----------------------------------------------------------------------
1001 # Read the uchar.h file and compile a mapping of Unicode symbols to
1002 # icu4c enum values.
1003 #
1004 # @param a filename for uchar.h
1005 #
1006 # @return a ref to a hash.  The keys of the hash are '_bp' for binary
1007 # properties, '_ep' for enumerated properties, '_dp'/'_sp'/'_mp' for
1008 # double/string/mask properties, and 'gc', 'gcm', 'bc', 'blk',
1009 # 'ea', 'dt', 'jt', 'jg', 'lb', or 'nt' for corresponding property
1010 # value aliases.  The values of the hash are subhashes.  The subhashes
1011 # have a key of the uchar.h enum symbol, and a value of the alias
1012 # string (as listed in PropertyValueAliases.txt).  NOTE: The alias
1013 # string is whatever alias uchar.h lists.  This may be either short or
1014 # long, depending on the specific enum.  NOTE: For blocks ('blk'), the
1015 # value is a hex code point for the start of the associated block.
1016 # NOTE: The special key _version will map to the Unicode version of
1017 # the file.
1018 sub read_uchar {
1019
1020     my $filename = shift;
1021
1022     my $mode = '';         # state machine mode and submode
1023     my $submode = '';
1024
1025     my $last = '';         # for line folding
1026
1027     my $hash = {};         # result
1028     my $key;               # first-level key
1029
1030     my $in = new FileHandle($filename, 'r');
1031     die "Error: Cannot open $filename" if (!defined $in);
1032
1033     while (<$in>) {
1034         # Fold continued lines together
1035         if (/^(.*)\\$/) {
1036             $last .= $1;
1037             next;
1038         } elsif ($last) {
1039             $_ = $last . $_;
1040             $last = '';
1041         }
1042
1043         # Exit all modes here
1044         if ($mode && $mode ne 'DEPRECATED') {
1045             if (/^\s*\}/) {
1046                 $mode = '';
1047                 next;
1048             }
1049         }
1050
1051         # Handle individual modes
1052
1053         if ($mode eq 'UProperty') {
1054             if (/^\s*(UCHAR_\w+)\s*[,=]/ || /^\s+(UCHAR_\w+)\s*$/) {
1055                 if ($submode) {
1056                     addDatum($hash, $key, $1, $submode);
1057                     $submode = '';
1058                 } else {
1059                     #print "Warning: Ignoring $1\n";
1060                 }
1061             }
1062
1063             elsif (m|^\s*/\*\*\s*(\w+)\s+property\s+(\w+)|i) {
1064                 die "Error: Unmatched tag $submode" if ($submode);
1065                 die "Error: Unrecognized UProperty comment: $_"
1066                     unless (exists $PROP_TYPE{$1});
1067                 $key = $PROP_TYPE{$1};
1068                 $submode = $2;
1069             }
1070         }
1071
1072         elsif ($mode eq 'UCharCategory') {
1073             if (/^\s*(U_\w+)\s*=/) {
1074                 if ($submode) {
1075                     addDatum($hash, 'gc', $1, $submode);
1076                     $submode = '';
1077                 } else {
1078                     #print "Warning: Ignoring $1\n";
1079                 }
1080             }
1081
1082             elsif (m|^\s*/\*\*\s*([A-Z][a-z])\s|) {
1083                 die "Error: Unmatched tag $submode" if ($submode);
1084                 $submode = $1;
1085             }
1086         }
1087
1088         elsif ($mode eq 'UCharDirection') {
1089             if (/^\s*(U_\w+)\s*[,=]/ || /^\s+(U_\w+)\s*$/) {
1090                 if ($submode) {
1091                     addDatum($hash, $key, $1, $submode);
1092                     $submode = '';
1093                 } else {
1094                     #print "Warning: Ignoring $1\n";
1095                 }
1096             }
1097
1098             elsif (m|/\*\*\s*([A-Z]+)\s|) {
1099                 die "Error: Unmatched tag $submode" if ($submode);
1100                 $key = 'bc';
1101                 $submode = $1;
1102             }
1103         }
1104
1105         elsif ($mode eq 'UBlockCode') {
1106             if (m|^\s*(UBLOCK_\w+).+?/\*\[(.+?)\]\*/|) {
1107                 addDatum($hash, 'blk', $1, $2);
1108             }
1109         }
1110
1111         elsif ($mode eq 'UEastAsianWidth') {
1112             if (m|^\s*(U_EA_\w+).+?/\*\[(.+?)\]\*/|) {
1113                 addDatum($hash, 'ea', $1, $2);
1114             }
1115         }
1116
1117         elsif ($mode eq 'UDecompositionType') {
1118             if (m|^\s*(U_DT_\w+).+?/\*\[(.+?)\]\*/|) {
1119                 addDatum($hash, 'dt', $1, $2);
1120             }
1121         }
1122
1123         elsif ($mode eq 'UJoiningType') {
1124             if (m|^\s*(U_JT_\w+).+?/\*\[(.+?)\]\*/|) {
1125                 addDatum($hash, 'jt', $1, $2);
1126             }
1127         }
1128
1129         elsif ($mode eq 'UJoiningGroup') {
1130             if (/^\s*(U_JG_(\w+))/) {
1131                 addDatum($hash, 'jg', $1, $2) unless ($2 eq 'COUNT');
1132             }
1133         }
1134
1135         elsif ($mode eq 'ULineBreak') {
1136             if (m|^\s*(U_LB_\w+).+?/\*\[(.+?)\]\*/|) {
1137                 addDatum($hash, 'lb', $1, $2);
1138             }
1139         }
1140
1141         elsif ($mode eq 'UNumericType') {
1142             if (m|^\s*(U_NT_\w+).+?/\*\[(.+?)\]\*/|) {
1143                 addDatum($hash, 'nt', $1, $2);
1144             }
1145         }
1146
1147         elsif ($mode eq 'UHangulSyllableType') {
1148             if (m|^\s*(U_HST_\w+).+?/\*\[(.+?)\]\*/|) {
1149                 addDatum($hash, 'hst', $1, $2);
1150             }
1151         }
1152
1153         elsif ($mode eq 'DEPRECATED') {
1154             if (/\s*\#ifdef/) {
1155                 die "Error: Nested #ifdef";
1156                 }
1157             elsif (/\s*\#endif/) {
1158                 $mode = '';
1159             }
1160         }
1161
1162         elsif (!$mode) {
1163             if (/^\s*\#define\s+(\w+)\s+(.+)/) {
1164                 # #define $left $right
1165                 my ($left, $right) = ($1, $2);
1166
1167                 if ($left eq 'U_UNICODE_VERSION') {
1168                     my $version = $right;
1169                     $version = $1 if ($version =~ /^\"(.*)\"/);
1170                     # print "Unicode version: ", $version, "\n";
1171                     die "Error: Multiple versions in $filename"
1172                         if (defined $hash->{'_version'});
1173                     $hash->{'_version'} = $version;
1174                 }
1175
1176                 elsif ($left =~ /U_GC_(\w+?)_MASK/) {
1177                     addDatum($hash, 'gcm', $left, $1);
1178                 }
1179             }
1180
1181             elsif (/^\s*typedef\s+enum\s+(\w+)\s*\{/ ||
1182                    /^\s*typedef\s+enum\s+(\w+)\s*$/) {
1183                 $mode = $1;
1184                 #print "Parsing $mode\n";
1185             }
1186
1187             elsif (/^\s*enum\s+(\w+)\s*\{/ ||
1188                    /^\s*enum\s+(\w+)\s*$/) {
1189                 $mode = $1;
1190                 #print "Parsing $mode\n";
1191             }
1192
1193             elsif (/^\s*\#ifdef\s+ICU_UCHAR_USE_DEPRECATES\b/) {
1194                 $mode = 'DEPRECATED';
1195             }
1196         }
1197     }
1198
1199     $in->close();
1200
1201     # hardcode known values for the normalization quick check properties
1202     # see unorm.h for the UNormalizationCheckResult enum
1203
1204     addDatum($hash, 'NFC_QC', 'UNORM_NO',    'N');
1205     addDatum($hash, 'NFC_QC', 'UNORM_YES',   'Y');
1206     addDatum($hash, 'NFC_QC', 'UNORM_MAYBE', 'M');
1207
1208     addDatum($hash, 'NFKC_QC', 'UNORM_NO',    'N');
1209     addDatum($hash, 'NFKC_QC', 'UNORM_YES',   'Y');
1210     addDatum($hash, 'NFKC_QC', 'UNORM_MAYBE', 'M');
1211
1212     # no "maybe" values for NF[K]D
1213
1214     addDatum($hash, 'NFD_QC', 'UNORM_NO',    'N');
1215     addDatum($hash, 'NFD_QC', 'UNORM_YES',   'Y');
1216
1217     addDatum($hash, 'NFKD_QC', 'UNORM_NO',    'N');
1218     addDatum($hash, 'NFKD_QC', 'UNORM_YES',   'Y');
1219
1220     $hash;
1221 }
1222
1223 #----------------------------------------------------------------------
1224 # Add a new value to a two-level hash.  That is, given a ref to
1225 # a hash, two keys, and a value, add $hash->{$key1}->{$key2} = $value.
1226 sub addDatum {
1227     my ($h, $k1, $k2, $v) = @_;
1228     if (exists $h->{$k1}->{$k2}) {
1229         die "Error: $k1:$k2 already set to " .
1230             $h->{$k1}->{$k2} . ", cannot set to " . $v;
1231     }
1232     $h->{$k1}->{$k2} = $v;
1233 }
1234
1235 #eof