icuSources/tools/genpname/preparse.pl

   1 #!/bin/perl -w
   2 #*******************************************************************
   3 # COPYRIGHT:
   4 # Copyright (c) 2002-2008, International Business Machines Corporation and
   5 # others. All Rights Reserved.
   6 #*******************************************************************
   7
   8 # This script reads in UCD files PropertyAliases.txt and
   9 # PropertyValueAliases.txt and correlates them with ICU enums
  10 # defined in uchar.h and uscript.h.  It then outputs a header
  11 # file which contains all names and enums.  The header is included
  12 # by the genpname tool C++ source file, which produces the actual
  13 # binary data file.
  14 #
  15 # See usage note below.
  16 #
  17 # TODO: The Property[Value]Alias.txt files state that they can support
  18 # more than 2 names per property|value.  Currently (Unicode 3.2) there
  19 # are always 1 or 2 names.  If more names were supported, presumably
  20 # the format would be something like:
  21 #    nv        ; Numeric_Value
  22 #    nv        ; Value_Numerique
  23 # CURRENTLY, this script assumes that there are 1 or two names.  Any
  24 # duplicates it sees are flagged as an error.  If multiple aliases
  25 # appear in a future version of Unicode, modify this script to support
  26 # that.
  27 #
  28 # NOTE: As of ICU 2.6, this script has been modified to know about the
  29 # pseudo-property gcm/General_Category_Mask, which corresponds to the
  30 # uchar.h property UCHAR_GENERAL_CATEGORY_MASK.  This property
  31 # corresponds to General_Category but is a bitmask value.  It does not
  32 # exist in the UCD.  Therefore, I special case it in several places
  33 # (search for General_Category_Mask and gcm).
  34 #
  35 # NOTE: As of ICU 2.6, this script reads an auxiliary data file,
  36 # SyntheticPropertyAliases.txt, containing property aliases not
  37 # present in the UCD but present in ICU.  This file resides in the
  38 # same directory as this script.  Its contents are merged into those
  39 # of PropertyAliases.txt as if the two files were appended.
  40 #
  41 # NOTE: The following names are handled specially.  See script below
  42 # for details.
  43 #
  44 #   T/True
  45 #   F/False
  46 #   No_Block
  47 #
  48 # Author: Alan Liu
  49 # Created: October 14 2002
  50 # Since: ICU 2.4
  51
  52 use FileHandle;
  53 use strict;
  54 use Dumpvalue;
  55
  56 my $DEBUG = 1;
  57 my $DUMPER = new Dumpvalue;
  58
  59 my $count = @ARGV;
  60 my $ICU_DIR = shift() || '';
  61 my $OUT_FILE = shift() || 'data.h';
  62 my $HEADER_DIR = "$ICU_DIR/source/common/unicode";
  63 my $UNIDATA_DIR = "$ICU_DIR/source/data/unidata";
  64
  65 # Get the current year from the system
  66 my $YEAR = 1900+@{[localtime]}[5]; # Get the current year
  67
  68 # Used to make "n/a" property [value] aliases (Unicode or Synthetic) unique
  69 my $propNA = 0;
  70 my $valueNA = 0;
  71
  72 #----------------------------------------------------------------------
  73 # Top level property keys for binary, enumerated, string, and double props
  74 my @TOP     = qw( _bp _ep _sp _dp _mp );
  75
  76 # This hash governs how top level properties are grouped into output arrays.
  77 #my %TOP_PROPS = ( "VALUED"   => [ '_bp', '_ep' ],
  78 #                  "NO_VALUE" => [ '_sp', '_dp' ] );m
  79 #my %TOP_PROPS = ( "BINARY"   => [ '_bp' ],
  80 #                  "ENUMERATED" => [ '_ep' ],
  81 #                  "STRING" => [ '_sp' ],
  82 #                  "DOUBLE" => [ '_dp' ] );
  83 my %TOP_PROPS = ( ""   => [ '_bp', '_ep', '_sp', '_dp', '_mp' ] );
  84
  85 my %PROP_TYPE = (Binary => "_bp",
  86                  String => "_sp",
  87                  Double => "_dp",
  88                  Enumerated => "_ep",
  89                  Bitmask => "_mp");
  90 #----------------------------------------------------------------------
  91
  92 # Properties that are unsupported in ICU
  93 my %UNSUPPORTED = (Composition_Exclusion => 1,
  94                    Decomposition_Mapping => 1,
  95                    Expands_On_NFC => 1,
  96                    Expands_On_NFD => 1,
  97                    Expands_On_NFKC => 1,
  98                    Expands_On_NFKD => 1,
  99                    FC_NFKC_Closure => 1,
 100                    ID_Start_Exceptions => 1,
 101                    Special_Case_Condition => 1,
 102                    );
 103
 104 # Short names of properties that weren't seen in uchar.h.  If the
 105 # properties weren't seen, don't complain about the property values
 106 # missing.
 107 my %MISSING_FROM_UCHAR;
 108
 109 # Additional property aliases beyond short and long names,
 110 # like space in addition to WSpace and White_Space in Unicode 4.1.
 111 # Hashtable, maps long name to alias.
 112 # For example, maps White_Space->space.
 113 #
 114 # If multiple additional aliases are defined,
 115 # then they are separated in the value string with '|'.
 116 # For example, White_Space->space|outer_space
 117 my %additional_property_aliases;
 118
 119 #----------------------------------------------------------------------
 120
 121 # Emitted class names
 122 my ($STRING_CLASS, $ALIAS_CLASS, $PROPERTY_CLASS) = qw(AliasName Alias Property);
 123
 124 if ($count < 1 || $count > 2 ||
 125     !-d $HEADER_DIR ||
 126     !-d $UNIDATA_DIR) {
 127     my $me = $0;
 128     $me =~ s|.+[/\\]||;
 129     my $lm = ' ' x length($me);
 130     print <<"END";
 131
 132 $me: Reads ICU4C headers and Unicode data files and creates
 133 $lm  a C header file that is included by genpname.  The header
 134 $lm  file matches constants defined in the ICU4C headers with
 135 $lm  property|value aliases in the Unicode data files.
 136
 137 Usage: $me <icu_dir> [<out_file>]
 138
 139 <icu_dir>   ICU4C root directory, containing
 140                source/common/unicode/uchar.h
 141                source/common/unicode/uscript.h
 142                source/data/unidata/Blocks.txt
 143                source/data/unidata/PropertyAliases.txt
 144                source/data/unidata/PropertyValueAliases.txt
 145 <out_file>  File name of header to be written;
 146             default is 'data.h'.
 147
 148 The Unicode versions of all input files must match.
 149 END
 150     exit(1);
 151 }
 152
 153 my ($h, $version) = readAndMerge($HEADER_DIR, $UNIDATA_DIR);
 154
 155 if ($DEBUG) {
 156     print "Merged hash:\n";
 157     for my $key (sort keys %$h) {
 158         my $hh = $h->{$key};
 159         for my $subkey (sort keys %$hh) {
 160             print "$key:$subkey:", $hh->{$subkey}, "\n";
 161         }
 162     }
 163 }
 164
 165 my $out = new FileHandle($OUT_FILE, 'w');
 166 die "Error: Can't write to $OUT_FILE: $!" unless (defined $out);
 167 my $save = select($out);
 168 formatData($h, $version);
 169 select($save);
 170 $out->close();
 171
 172 exit(0);
 173
 174 #----------------------------------------------------------------------
 175 # From PropList.html: "The properties of the form Other_XXX
 176 # are used to generate properties in DerivedCoreProperties.txt.
 177 # They are not intended for general use, such as in APIs that
 178 # return property values.
 179 # Non_Break is not a valid property as of 3.2.
 180 sub isIgnoredProperty {
 181     local $_ = shift;
 182     /^Other_/i || /^Non_Break$/i;
 183 }
 184
 185 # 'qc' is a pseudo-property matching any quick-check property
 186 # see PropertyValueAliases.txt file comments.  'binprop' is
 187 # a synthetic binary value alias "True"/"False", not present
 188 # in PropertyValueAliases.txt until Unicode 5.0.
 189 # Starting with Unicode 5.1, PropertyValueAliases.txt does have
 190 # explicit values for binary properties.
 191 sub isPseudoProperty {
 192     $_[0] eq 'qc' ||
 193         $_[0] eq 'binprop';
 194 }
 195
 196 #----------------------------------------------------------------------
 197 # Emit the combined data from headers and the Unicode database as a
 198 # C source code header file.
 199 #
 200 # @param ref to hash with the data
 201 # @param Unicode version, as a string
 202 sub formatData {
 203     my $h = shift;
 204     my $version = shift;
 205
 206     my $date = scalar localtime();
 207     print <<"END";
 208 /**
 209  * Copyright (C) 2002-$YEAR, International Business Machines Corporation and
 210  * others. All Rights Reserved.
 211  *
 212  * MACHINE GENERATED FILE.  !!! Do not edit manually !!!
 213  *
 214  * Generated from
 215  *   uchar.h
 216  *   uscript.h
 217  *   Blocks.txt
 218  *   PropertyAliases.txt
 219  *   PropertyValueAliases.txt
 220  *
 221  * Date: $date
 222  * Unicode version: $version
 223  * Script: $0
 224  */
 225
 226 END
 227
 228     #------------------------------------------------------------
 229     # Emit Unicode version
 230     print "/* Unicode version $version */\n";
 231     my @v = split(/\./, $version);
 232     push @v, '0' while (@v < 4);
 233     for (my $i=0; $i<@v; ++$i) {
 234         print "const uint8_t VERSION_$i = $v[$i];\n";
 235     }
 236     print "\n";
 237
 238     #------------------------------------------------------------
 239     # Emit String table
 240     # [A table of all identifiers, that is, all long or short property
 241     # or value names.  The list need NOT be sorted; it will be sorted
 242     # by the C program.  Strings are referenced by their index into
 243     # this table.  After sorting, a REMAP[] array is used to map the
 244     # old position indices to the new positions.]
 245     my %strings;
 246     for my $prop (sort keys %$h) {
 247         my $hh = $h->{$prop};
 248         for my $enum (sort keys %$hh) {
 249             my @a = split(/\|/, $hh->{$enum});
 250             for (@a) {
 251                 $strings{$_} = 1 if (length($_));
 252             }
 253         }
 254     }
 255     my @strings = sort keys %strings;
 256     unshift @strings, "";
 257
 258     print "const int32_t STRING_COUNT = ", scalar @strings, ";\n\n";
 259
 260     # while printing, create a mapping hash from string table entry to index
 261     my %stringToID;
 262     print "/* to be sorted */\n";
 263     print "const $STRING_CLASS STRING_TABLE[] = {\n";
 264     for (my $i=0; $i<@strings; ++$i) {
 265         print "    $STRING_CLASS(\"$strings[$i]\", $i),\n";
 266         $stringToID{$strings[$i]} = $i;
 267     }
 268     print "};\n\n";
 269
 270     # placeholder for the remapping index.  this is used to map
 271     # indices that we compute here to indices of the sorted
 272     # STRING_TABLE.  STRING_TABLE will be sorted by the C++ program
 273     # using the uprv_comparePropertyNames() function.  this will
 274     # reshuffle the order.  we then use the indices (passed to the
 275     # String constructor) to create a REMAP[] array.
 276     print "/* to be filled in */\n";
 277     print "int32_t REMAP[", scalar @strings, "];\n\n";
 278
 279     #------------------------------------------------------------
 280     # Emit the name group table
 281     # [A table of name groups.  A name group is one or more names
 282     # for a property or property value.  The Unicode data files specify
 283     # that there may be more than 2, although as of Unicode 3.2 there
 284     # are at most 2.  The name group table looks like this:
 285     #
 286     #  114, -115, 116, -117, 0, -118, 65, -64, ...
 287     #  [0]        [2]        [4]      [6]
 288     #
 289     # The entry at [0] consists of 2 strings, 114 and 115.
 290     # The entry at [2] consists of 116 and 117.  The entry at
 291     # [4] is one string, 118.  There is always at least one
 292     # string; typically there are two.  If there are two, the first
 293     # is the SHORT name and the second is the LONG.  If there is
 294     # one, then the missing entry (always the short name, in 3.2)
 295     # is zero, which is by definition the index of "".  The
 296     # 'preferred' name will generally be the LONG name, if there are
 297     # more than 2 entries.  The last entry is negative.
 298
 299     # Build name group list and replace string refs with nameGroup indices
 300     my @nameGroups;
 301
 302     # Check for duplicate name groups, and reuse them if possible
 303     my %groupToInt; # Map group strings to ints
 304     for my $prop (sort keys %$h) {
 305         my $hh = $h->{$prop};
 306         for my $enum (sort keys %$hh) {
 307             my $groupString = $hh->{$enum};
 308             my $i;
 309             if (exists $groupToInt{$groupString}) {
 310                 $i = $groupToInt{$groupString};
 311             } else {
 312                 my @names = split(/\|/, $groupString);
 313                 die "Error: Wrong number of names in " . $groupString if (@names < 1);
 314                 $i = @nameGroups; # index of group we are making
 315                 $groupToInt{$groupString} = $i; # Cache for reuse
 316                 push @nameGroups, map { $stringToID{$_} } @names;
 317                 $nameGroups[$#nameGroups] = -$nameGroups[$#nameGroups]; # mark end
 318             }
 319             # now, replace string list with ref to name group
 320             $hh->{$enum} = $i;
 321         }
 322     }
 323
 324     print "const int32_t NAME_GROUP_COUNT = ",
 325           scalar @nameGroups, ";\n\n";
 326
 327     print "int32_t NAME_GROUP[] = {\n";
 328     # emit one group per line, with annotations
 329     my $max_names = 0;
 330     for (my $i=0; $i<@nameGroups; ) {
 331         my @a;
 332         my $line;
 333         my $start = $i;
 334         for (;;) {
 335             my $j = $nameGroups[$i++];
 336             $line .= "$j, ";
 337             push @a, abs($j);
 338             last if ($j < 0);
 339         }
 340         print "    ",
 341               $line,
 342               ' 'x(20-length($line)),
 343               "/* ", sprintf("%3d", $start),
 344               ": \"", join("\", \"", map { $strings[$_] } @a), "\" */\n";
 345         $max_names = @a if(@a > $max_names);
 346
 347     }
 348     print "};\n\n";
 349
 350     # This is fixed for 3.2 at "2" but should be calculated dynamically
 351     # when more than 2 names appear in Property[Value]Aliases.txt.
 352     print "#define MAX_NAMES_PER_GROUP $max_names\n\n";
 353
 354     #------------------------------------------------------------
 355     # Emit enumerated property values
 356     for my $prop (sort keys %$h) {
 357         next if ($prop =~ /^_/);
 358         my $vh = $h->{$prop};
 359         my $count = scalar keys %$vh;
 360
 361         print "const int32_t VALUES_${prop}_COUNT = ",
 362               $count, ";\n\n";
 363
 364         print "const $ALIAS_CLASS VALUES_${prop}\[] = {\n";
 365         for my $enum (sort keys %$vh) {
 366             #my @names = split(/\|/, $vh->{$enum});
 367             #die "Error: Wrong number of names for $prop:$enum in [" . join(",", @names) . "]"
 368             #    if (@names != 2);
 369             print "    $ALIAS_CLASS((int32_t) $enum, ", $vh->{$enum}, "),\n";
 370                   #$stringToID{$names[0]}, ", ",
 371                   #$stringToID{$names[1]}, "),\n";
 372             #      "\"", $names[0], "\", ",
 373             #      "\"", $names[1], "\"),\n";
 374         }
 375         print "};\n\n";
 376     }
 377
 378     #------------------------------------------------------------
 379     # Emit top-level properties (binary, enumerated, etc.)
 380     for my $topName (sort keys %TOP_PROPS) {
 381         my $a = $TOP_PROPS{$topName};
 382         my $count = 0;
 383         for my $type (@$a) { # "_bp", "_ep", etc.
 384             $count += scalar keys %{$h->{$type}};
 385         }
 386
 387         print "const int32_t ${topName}PROPERTY_COUNT = $count;\n\n";
 388
 389         print "const $PROPERTY_CLASS ${topName}PROPERTY[] = {\n";
 390
 391         for my $type (@$a) { # "_bp", "_ep", etc.
 392             my $p = $h->{$type};
 393
 394             for my $enum (sort keys %$p) {
 395                 my $name = $strings[$nameGroups[$p->{$enum}]];
 396
 397                 my $valueRef = "0, NULL";
 398                 if ($type eq '_bp') {
 399                     $valueRef = "VALUES_binprop_COUNT, VALUES_binprop";
 400                 }
 401                 elsif (exists $h->{$name}) {
 402                     $valueRef = "VALUES_${name}_COUNT, VALUES_$name";
 403                 }
 404
 405                 print "    $PROPERTY_CLASS((int32_t) $enum, ",
 406                       $p->{$enum}, ", $valueRef),\n";
 407             }
 408         }
 409         print "};\n\n";
 410     }
 411
 412     print "/*eof*/\n";
 413 }
 414
 415 #----------------------------------------------------------------------
 416 # Read in the files uchar.h, uscript.h, Blocks.txt,
 417 # PropertyAliases.txt, and PropertyValueAliases.txt,
 418 # and combine them into one hash.
 419 #
 420 # @param directory containing headers
 421 # @param directory containin Unicode data files
 422 #
 423 # @return hash ref, Unicode version
 424 sub readAndMerge {
 425
 426     my ($headerDir, $unidataDir) = @_;
 427
 428     my $h = read_uchar("$headerDir/uchar.h");
 429     my $s = read_uscript("$headerDir/uscript.h");
 430     my $b = read_Blocks("$unidataDir/Blocks.txt");
 431     my $pa = {};
 432     read_PropertyAliases($pa, "$unidataDir/PropertyAliases.txt");
 433     read_PropertyAliases($pa, "SyntheticPropertyAliases.txt");
 434     my $va = {};
 435     read_PropertyValueAliases($va, "$unidataDir/PropertyValueAliases.txt");
 436     read_PropertyValueAliases($va, "SyntheticPropertyValueAliases.txt");
 437
 438     # Extract property family hash
 439     my $fam = $pa->{'_family'};
 440     delete $pa->{'_family'};
 441
 442     # Note: uscript.h has no version string, so don't check it
 443     my $version = check_versions([ 'uchar.h', $h ],
 444                                  [ 'Blocks.txt', $b ],
 445                                  [ 'PropertyAliases.txt', $pa ],
 446                                  [ 'PropertyValueAliases.txt', $va ]);
 447
 448     # Do this BEFORE merging; merging modifies the hashes
 449     check_PropertyValueAliases($pa, $va);
 450
 451     # Dump out the $va hash for debugging
 452     if ($DEBUG) {
 453         print "Property values hash:\n";
 454         for my $key (sort keys %$va) {
 455             my $hh = $va->{$key};
 456             for my $subkey (sort keys %$hh) {
 457                 print "$key:$subkey:", $hh->{$subkey}, "\n";
 458             }
 459         }
 460     }
 461
 462     # Dump out the $s hash for debugging
 463     if ($DEBUG) {
 464         print "Script hash:\n";
 465         for my $key (sort keys %$s) {
 466             print "$key:", $s->{$key}, "\n";
 467         }
 468     }
 469
 470     # Link in the script data
 471     $h->{'sc'} = $s;
 472
 473     merge_Blocks($h, $b);
 474
 475     merge_PropertyAliases($h, $pa, $fam);
 476
 477     merge_PropertyValueAliases($h, $va);
 478
 479     ($h, $version);
 480 }
 481
 482 #----------------------------------------------------------------------
 483 # Ensure that the version strings in the given hashes (under the key
 484 # '_version') are compatible.  Currently this means they must be
 485 # identical, with the exception that "X.Y" will match "X.Y.0".
 486 # All hashes must define the key '_version'.
 487 #
 488 # @param a list of pairs of (file name, hash reference)
 489 #
 490 # @return the version of all the hashes.  Upon return, the '_version'
 491 # will be removed from all hashes.
 492 sub check_versions {
 493     my $version = '';
 494     my $msg = '';
 495     foreach my $a (@_) {
 496         my $name = $a->[0];
 497         my $h    = $a->[1];
 498         die "Error: No version found" unless (exists $h->{'_version'});
 499         my $v = $h->{'_version'};
 500         delete $h->{'_version'};
 501
 502         # append ".0" if necessary, to standardize to X.Y.Z
 503         $v .= '.0' unless ($v =~ /\.\d+\./);
 504         $v .= '.0' unless ($v =~ /\.\d+\./);
 505         $msg .= "$name = $v\n";
 506         if ($version) {
 507             die "Error: Mismatched Unicode versions\n$msg"
 508                 unless ($version eq $v);
 509         } else {
 510             $version = $v;
 511         }
 512     }
 513     $version;
 514 }
 515
 516 #----------------------------------------------------------------------
 517 # Make sure the property names in PropertyValueAliases.txt match those
 518 # in PropertyAliases.txt.
 519 #
 520 # @param a hash ref from read_PropertyAliases.
 521 # @param a hash ref from read_PropertyValueAliases.
 522 sub check_PropertyValueAliases {
 523     my ($pa, $va) = @_;
 524
 525     # make a reverse hash of short->long
 526     my %rev;
 527     for (keys %$pa) { $rev{$pa->{$_}} = $_; }
 528
 529     for my $prop (keys %$va) {
 530         if (!exists $rev{$prop} && !isPseudoProperty($prop)) {
 531             print "Warning: Property $prop from PropertyValueAliases not listed in PropertyAliases\n";
 532         }
 533     }
 534 }
 535
 536 #----------------------------------------------------------------------
 537 # Merge blocks data into uchar.h enum data.  In the 'blk' subhash all
 538 # code point values, as returned from read_uchar, are replaced by
 539 # block names, as read from Blocks.txt and returned by read_Blocks.
 540 # The match must be 1-to-1.  If there is any failure of 1-to-1
 541 # mapping, an error is signaled.  Upon return, the read_Blocks hash
 542 # is emptied of all contents, except for those that failed to match.
 543 #
 544 # The mapping in the 'blk' subhash, after this function returns, is
 545 # from uchar.h enum name, e.g. "UBLOCK_BASIC_LATIN", to Blocks.h
 546 # pseudo-name, e.g. "Basic Latin".
 547 #
 548 # @param a hash ref from read_uchar.
 549 # @param a hash ref from read_Blocks.
 550 sub merge_Blocks {
 551     my ($h, $b) = @_;
 552
 553     die "Error: No blocks data in uchar.h"
 554         unless (exists $h->{'blk'});
 555     my $blk = $h->{'blk'};
 556     for my $enum (keys %$blk) {
 557         my $cp = $blk->{$enum};
 558         if ($cp && !exists $b->{$cp}) {
 559             die "Error: No block found at $cp in Blocks.txt";
 560         }
 561         # Convert code point to pseudo-name:
 562         $blk->{$enum} = $b->{$cp};
 563         delete $b->{$cp};
 564     }
 565     my $err = '';
 566     for my $cp (keys %$b) {
 567         $err .= "Error: Block " . $b->{$cp} . " not listed in uchar.h\n";
 568     }
 569     die $err if ($err);
 570 }
 571
 572 #----------------------------------------------------------------------
 573 # Merge property alias names into the uchar.h hash.  The subhashes
 574 # under the keys _* (b(inary, e(numerated, s(tring, d(ouble) are
 575 # examined and the values of those subhashes are assumed to be long
 576 # names in PropertyAliases.txt.  They are validated and replaced by
 577 # "<short>|<long>".  Upon return, the read_PropertyAliases hash is
 578 # emptied of all contents, except for those that failed to match.
 579 # Unmatched names in PropertyAliases are listed as a warning but do
 580 # NOT cause the script to die.
 581 #
 582 # @param a hash ref from read_uchar.
 583 # @param a hash ref from read_PropertyAliases.
 584 # @param a hash mapping long names to property family (e.g., 'binary')
 585 sub merge_PropertyAliases {
 586     my ($h, $pa, $fam) = @_;
 587
 588     for my $k (@TOP) {
 589         die "Error: No properties data for $k in uchar.h"
 590             unless (exists $h->{$k});
 591     }
 592
 593     for my $subh (map { $h->{$_} } @TOP) {
 594         for my $enum (keys %$subh) {
 595             my $long_name = $subh->{$enum};
 596             if (!exists $pa->{$long_name}) {
 597                 die "Error: Property $long_name not found (or used more than once)";
 598             }
 599
 600             my $value;
 601             if($pa->{$long_name} =~ m|^n/a\d*$|) {
 602                 # replace an "n/a" short name with an empty name (nothing before "|");
 603                 # don't remove it (don't remove the "|"): there must always be a long name,
 604                 # and if the short name is removed, then the long name becomes the
 605                 # short name and there is no long name left (unless there is another alias)
 606                 $value = "|" . $long_name;
 607             } else {
 608                 $value = $pa->{$long_name} . "|" . $long_name;
 609             }
 610             if (exists $additional_property_aliases{$long_name}) {
 611                 $value .= "|" . $additional_property_aliases{$long_name};
 612             }
 613             $subh->{$enum} = $value;
 614             delete $pa->{$long_name};
 615         }
 616     }
 617
 618     my @err;
 619     for my $name (keys %$pa) {
 620         $MISSING_FROM_UCHAR{$pa->{$name}} = 1;
 621         if (exists $UNSUPPORTED{$name}) {
 622             push @err, "Info: No enum for " . $fam->{$name} . " property $name in uchar.h";
 623         } elsif (!isIgnoredProperty($name)) {
 624             push @err, "Warning: No enum for " . $fam->{$name} . " property $name in uchar.h";
 625         }
 626     }
 627     print join("\n", sort @err), "\n" if (@err);
 628 }
 629
 630 #----------------------------------------------------------------------
 631 # Return 1 if two names match ignoring whitespace, '-', and '_'.
 632 # Used to match names in Blocks.txt with those in PropertyValueAliases.txt
 633 # as of Unicode 4.0.
 634 sub matchesLoosely {
 635     my ($a, $b) = @_;
 636     $a =~ s/[\s\-_]//g;
 637     $b =~ s/[\s\-_]//g;
 638     $a =~ /^$b$/i;
 639 }
 640
 641 #----------------------------------------------------------------------
 642 # Merge PropertyValueAliases.txt data into the uchar.h hash.  All
 643 # properties other than blk, _bp, and _ep are analyzed and mapped to
 644 # the names listed in PropertyValueAliases.  They are then replaced
 645 # with a string of the form "<short>|<long>".  The short or long name
 646 # may be missing.
 647 #
 648 # @param a hash ref from read_uchar.
 649 # @param a hash ref from read_PropertyValueAliases.
 650 sub merge_PropertyValueAliases {
 651     my ($h, $va) = @_;
 652
 653     my %gcCount;
 654     for my $prop (keys %$h) {
 655         # _bp, _ep handled in merge_PropertyAliases
 656         next if ($prop =~ /^_/);
 657
 658         # Special case: gcm
 659         my $prop2 = ($prop eq 'gcm') ? 'gc' : $prop;
 660
 661         # find corresponding PropertyValueAliases data
 662         die "Error: Can't find $prop in PropertyValueAliases.txt"
 663             unless (exists $va->{$prop2});
 664         my $pva = $va->{$prop2};
 665
 666         # match up data
 667         my $hh = $h->{$prop};
 668         for my $enum (keys %$hh) {
 669
 670             my $name = $hh->{$enum};
 671
 672             # look up both long and short & ignore case
 673             my $n;
 674             if (exists $pva->{$name}) {
 675                 $n = $name;
 676             } else {
 677                 # iterate (slow)
 678                 for my $a (keys %$pva) {
 679                     # case-insensitive match
 680                     # & case-insensitive reverse match
 681                     if ($a =~ /^$name$/i ||
 682                         $pva->{$a} =~ /^$name$/i) {
 683                         $n = $a;
 684                         last;
 685                     }
 686                 }
 687             }
 688
 689             # For blocks, do a loose match from Blocks.txt pseudo-name
 690             # to PropertyValueAliases long name.
 691             if (!$n && $prop eq 'blk') {
 692                 for my $a (keys %$pva) {
 693                     # The block is only going to match the long name,
 694                     # but we check both for completeness.  As of Unicode
 695                     # 4.0, blocks do not have short names.
 696                     if (matchesLoosely($name, $pva->{$a}) ||
 697                         matchesLoosely($name, $a)) {
 698                         $n = $a;
 699                         last;
 700                     }
 701                 }
 702             }
 703
 704             die "Error: Property value $prop:$name not found" unless ($n);
 705
 706             my $l = $n;
 707             my $r = $pva->{$n};
 708             # convert |n/a\d*| to blank
 709             $l = '' if ($l =~ m|^n/a\d*$|);
 710             $r = '' if ($r =~ m|^n/a\d*$|);
 711
 712             $hh->{$enum} = "$l|$r";
 713             # Don't delete the 'gc' properties because we need to share
 714             # them between 'gc' and 'gcm'.  Count each use instead.
 715             if ($prop2 eq 'gc') {
 716                 ++$gcCount{$n};
 717             } else {
 718                 delete $pva->{$n};
 719             }
 720         }
 721     }
 722
 723     # Merge the combining class values in manually
 724     # Add the same values to the synthetic lccc and tccc properties
 725     die "Error: No ccc data"
 726         unless exists $va->{'ccc'};
 727     for my $ccc (keys %{$va->{'ccc'}}) {
 728         die "Error: Can't overwrite ccc $ccc"
 729             if (exists $h->{'ccc'}->{$ccc});
 730         $h->{'lccc'}->{$ccc} =
 731         $h->{'tccc'}->{$ccc} =
 732         $h->{'ccc'}->{$ccc} = $va->{'ccc'}->{$ccc};
 733     }
 734     delete $va->{'ccc'};
 735
 736     # Merge synthetic binary property values in manually.
 737     # These are the "True" and "False" value aliases.
 738     die "Error: No True/False value aliases"
 739         unless exists $va->{'binprop'};
 740     for my $bp (keys %{$va->{'binprop'}}) {
 741         $h->{'binprop'}->{$bp} = $va->{'binprop'}->{$bp};
 742     }
 743     delete $va->{'binprop'};
 744
 745     my $err = '';
 746     for my $prop (sort keys %$va) {
 747         my $hh = $va->{$prop};
 748         for my $subkey (sort keys %$hh) {
 749             # 'gc' props are shared with 'gcm'; make sure they were used
 750             # once or twice.
 751             if ($prop eq 'gc') {
 752                 my $n = $gcCount{$subkey};
 753                 next if ($n >= 1 && $n <= 2);
 754             }
 755             $err .= "Warning: Enum for value $prop:$subkey not found in uchar.h\n"
 756                 unless exists $MISSING_FROM_UCHAR{$prop};
 757         }
 758     }
 759     print $err if ($err);
 760 }
 761
 762 #----------------------------------------------------------------------
 763 # Read the PropertyAliases.txt file.  Return a hash that maps the long
 764 # name to the short name.  The special key '_version' will map to the
 765 # Unicode version of the file.  The special key '_family' holds a
 766 # subhash that maps long names to a family string, for descriptive
 767 # purposes.
 768 #
 769 # @param a filename for PropertyAliases.txt
 770 # @param reference to hash to receive data.  Keys are long names.
 771 # Values are short names.
 772 sub read_PropertyAliases {
 773
 774     my $hash = shift;         # result
 775
 776     my $filename = shift;
 777
 778     my $fam = {};  # map long names to family string
 779     $fam = $hash->{'_family'} if (exists $hash->{'_family'});
 780
 781     my $family; # binary, enumerated, etc.
 782
 783     my $in = new FileHandle($filename, 'r');
 784     die "Error: Cannot open $filename" if (!defined $in);
 785
 786     while (<$in>) {
 787
 788         # Read version (embedded in a comment)
 789         if (/PropertyAliases-(\d+\.\d+\.\d+)/i) {
 790             die "Error: Multiple versions in $filename"
 791                 if (exists $hash->{'_version'});
 792             $hash->{'_version'} = $1;
 793         }
 794
 795         # Read family heading
 796         if (/^\s*\#\s*(.+?)\s*Properties\s*$/) {
 797             $family = $1;
 798         }
 799
 800         # Ignore comments and blank lines
 801         s/\#.*//;
 802         next unless (/\S/);
 803
 804         if (/^\s*(.+?)\s*;/) {
 805             my $short = $1;
 806             my @fields = /;\s*([^\s;]+)/g;
 807             if (@fields < 1 || @fields > 2) {
 808                 my $number = @fields;
 809                 die "Error: Wrong number of fields ($number) in $filename at $_";
 810             }
 811
 812             # Make "n/a" strings unique
 813             if ($short eq 'n/a') {
 814                 $short .= sprintf("%03d", $propNA++);
 815             }
 816             my $long = $fields[0];
 817             if ($long eq 'n/a') {
 818                 $long .= sprintf("%03d", $propNA++);
 819             }
 820
 821             # Add long name->short name to the hash=pa hash table
 822             if (exists $hash->{$long}) {
 823                 die "Error: Duplicate property $long in $filename"
 824             }
 825             $hash->{$long} = $short;
 826             $fam->{$long} = $family;
 827
 828             # Add the list of further aliases to the additional_property_aliases hash table,
 829             # using the long property name as the key.
 830             # For example:
 831             #   White_Space->space|outer_space
 832             if (@fields > 1) {
 833                 my $value = pop @fields;
 834                 while (@fields > 1) {
 835                     $value .= "|" . pop @fields;
 836                 }
 837                 $additional_property_aliases{$long} = $value;
 838             }
 839         } else {
 840             die "Error: Can't parse $_ in $filename";
 841         }
 842     }
 843
 844     $in->close();
 845
 846     $hash->{'_family'} = $fam;
 847 }
 848
 849 #----------------------------------------------------------------------
 850 # Read the PropertyValueAliases.txt file.  Return a two level hash
 851 # that maps property_short_name:value_short_name:value_long_name.  In
 852 # the case of the 'ccc' property, the short name is the numeric class
 853 # and the long name is "<short>|<long>".  The special key '_version'
 854 # will map to the Unicode version of the file.
 855 #
 856 # @param a filename for PropertyValueAliases.txt
 857 #
 858 # @return a hash reference.
 859 sub read_PropertyValueAliases {
 860
 861     my $hash = shift;         # result
 862
 863     my $filename = shift;
 864
 865     my $in = new FileHandle($filename, 'r');
 866     die "Error: Cannot open $filename" if (!defined $in);
 867
 868     while (<$in>) {
 869
 870         # Read version (embedded in a comment)
 871         if (/PropertyValueAliases-(\d+\.\d+\.\d+)/i) {
 872             die "Error: Multiple versions in $filename"
 873                 if (exists $hash->{'_version'});
 874             $hash->{'_version'} = $1;
 875         }
 876
 877         # Ignore comments and blank lines
 878         s/\#.*//;
 879         next unless (/\S/);
 880
 881         if (/^\s*(.+?)\s*;/i) {
 882             my $prop = $1;
 883             my @fields = /;\s*([^\s;]+)/g;
 884             die "Error: Wrong number of fields in $filename"
 885                 if (@fields < 2 || @fields > 5);
 886             # Make "n/a" strings unique
 887             $fields[0] .= sprintf("%03d", $valueNA++) if ($fields[0] eq 'n/a');
 888             # Squash extra fields together
 889             while (@fields > 2) {
 890                 my $f = pop @fields;
 891                 $fields[$#fields] .= '|' . $f;
 892             }
 893             addDatum($hash, $prop, @fields);
 894         }
 895
 896         else {
 897             die "Error: Can't parse $_ in $filename";
 898         }
 899     }
 900
 901     $in->close();
 902
 903     # Script Copt=Qaac (Coptic) is a special case.
 904     # Before the Copt code was defined, the private-use code Qaac was used.
 905     # Starting with Unicode 4.1, PropertyValueAliases.txt contains
 906     # Copt as the short name as well as Qaac as an alias.
 907     # For use with older Unicode data files, we add here a Qaac->Coptic entry.
 908     # This should not do anything for 4.1-and-later Unicode data files.
 909     # See also UAX #24: Script Names http://www.unicode.org/unicode/reports/tr24/
 910     $hash->{'sc'}->{'Qaac'} = 'Coptic'
 911         unless (exists $hash->{'sc'}->{'Qaac'} || exists $hash->{'sc'}->{'Copt'});
 912
 913     # Add N|No|T|True and Y|Yes|F|False -- these are values we recognize for
 914     # binary properties (until Unicode 5.0 NOT from PropertyValueAliases.txt).
 915     # These are of the same form as the 'ccc' value aliases.
 916     # Starting with Unicode 5.1, PropertyValueAliases.txt does have values
 917     # for binary properties.
 918     if (!exists $hash->{'binprop'}->{'0'}) {
 919         if (exists $hash->{'Alpha'}->{'N'}) {
 920             # Unicode 5.1 and later: Make the numeric value the key.
 921             $hash->{'binprop'}->{'0'} = 'N|' . $hash->{'Alpha'}->{'N'};
 922             $hash->{'binprop'}->{'1'} = 'Y|' . $hash->{'Alpha'}->{'Y'};
 923         } elsif (exists $hash->{'Alpha'}) {
 924             die "Error: Unrecognized short value name for binary property 'Alpha'\n";
 925         } else {
 926             # Unicode 5.0 and earlier: Add manually.
 927             $hash->{'binprop'}->{'0'} = 'N|No|F|False';
 928             $hash->{'binprop'}->{'1'} = 'Y|Yes|T|True';
 929         }
 930     }
 931 }
 932
 933 #----------------------------------------------------------------------
 934 # Read the Blocks.txt file.  Return a hash that maps the code point
 935 # range start to the block name.  The special key '_version' will map
 936 # to the Unicode version of the file.
 937 #
 938 # As of Unicode 4.0, the names in the Blocks.txt are no longer the
 939 # proper names.  The proper names are now listed in PropertyValueAliases.
 940 # They are similar but not identical.  Furthermore, 4.0 introduces
 941 # a new block name, No_Block, which is listed only in PropertyValueAliases
 942 # and not in Blocks.txt.  As a result, we handle blocks as follows:
 943 #
 944 # 1. Read Blocks.txt to map code point range start to quasi-block name.
 945 # 2. Add to Blocks.txt a synthetic No Block code point & name:
 946 #    X -> No Block
 947 # 3. Map quasi-names from Blocks.txt (including No Block) to actual
 948 #    names from PropertyValueAliases.  This occurs in
 949 #    merge_PropertyValueAliases.
 950 #
 951 # @param a filename for Blocks.txt
 952 #
 953 # @return a ref to a hash.  Keys are code points, as text, e.g.,
 954 # "1720".  Values are pseudo-block names, e.g., "Hanunoo".
 955 sub read_Blocks {
 956
 957     my $filename = shift;
 958
 959     my $hash = {};         # result
 960
 961     my $in = new FileHandle($filename, 'r');
 962     die "Error: Cannot open $filename" if (!defined $in);
 963
 964     while (<$in>) {
 965
 966         # Read version (embedded in a comment)
 967         if (/Blocks-(\d+\.\d+\.\d+)/i) {
 968             die "Error: Multiple versions in $filename"
 969                 if (exists $hash->{'_version'});
 970             $hash->{'_version'} = $1;
 971         }
 972
 973         # Ignore comments and blank lines
 974         s/\#.*//;
 975         next unless (/\S/);
 976
 977         if (/^([0-9a-f]+)\.\.[0-9a-f]+\s*;\s*(.+?)\s*$/i) {
 978             die "Error: Duplicate range $1 in $filename"
 979                 if (exists $hash->{$1});
 980             $hash->{$1} = $2;
 981         }
 982
 983         else {
 984             die "Error: Can't parse $_ in $filename";
 985         }
 986     }
 987
 988     $in->close();
 989
 990     # Add pseudo-name for No Block
 991     $hash->{'none'} = 'No Block';
 992
 993     $hash;
 994 }
 995
 996 #----------------------------------------------------------------------
 997 # Read the uscript.h file and compile a mapping of Unicode symbols to
 998 # icu4c enum values.
 999 #
1000 # @param a filename for uscript.h
1001 #
1002 # @return a ref to a hash.  The keys of the hash are enum symbols from
1003 # uscript.h, and the values are script names.
1004 sub read_uscript {
1005
1006     my $filename = shift;
1007
1008     my $mode = '';         # state machine mode and submode
1009     my $submode = '';
1010
1011     my $last = '';         # for line folding
1012
1013     my $hash = {};         # result
1014     my $key;               # first-level key
1015
1016     my $in = new FileHandle($filename, 'r');
1017     die "Error: Cannot open $filename" if (!defined $in);
1018
1019     while (<$in>) {
1020         # Fold continued lines together
1021         if (/^(.*)\\$/) {
1022             $last = $1;
1023             next;
1024         } elsif ($last) {
1025             $_ = $last . $_;
1026             $last = '';
1027         }
1028
1029         # Exit all modes here
1030         if ($mode && $mode ne 'DEPRECATED') {
1031             if (/^\s*\}/) {
1032                 $mode = '';
1033                 next;
1034             }
1035         }
1036
1037         # Handle individual modes
1038
1039         if ($mode eq 'UScriptCode') {
1040             if (m|^\s*(USCRIPT_\w+).+?/\*\s*(\w+)|) {
1041                 my ($enum, $code) = ($1, $2);
1042                 die "Error: Duplicate script $enum"
1043                     if (exists $hash->{$enum});
1044                 $hash->{$enum} = $code;
1045             }
1046         }
1047
1048         elsif ($mode eq 'DEPRECATED') {
1049             if (/\s*\#ifdef/) {
1050                 die "Error: Nested #ifdef";
1051                 }
1052             elsif (/\s*\#endif/) {
1053                 $mode = '';
1054             }
1055         }
1056
1057         elsif (!$mode) {
1058             if (/^\s*typedef\s+enum\s+(\w+)\s*\{/ ||
1059                    /^\s*typedef\s+enum\s+(\w+)\s*$/) {
1060                 $mode = $1;
1061                 #print "Parsing $mode\n";
1062             }
1063
1064             elsif (/^\s*\#ifdef\s+ICU_UCHAR_USE_DEPRECATES\b/) {
1065                 $mode = 'DEPRECATED';
1066             }
1067         }
1068     }
1069
1070     $in->close();
1071
1072     $hash;
1073 }
1074
1075 #----------------------------------------------------------------------
1076 # Read the uchar.h file and compile a mapping of Unicode symbols to
1077 # icu4c enum values.
1078 #
1079 # @param a filename for uchar.h
1080 #
1081 # @return a ref to a hash.  The keys of the hash are '_bp' for binary
1082 # properties, '_ep' for enumerated properties, '_dp'/'_sp'/'_mp' for
1083 # double/string/mask properties, and 'gc', 'gcm', 'bc', 'blk',
1084 # 'ea', 'dt', 'jt', 'jg', 'lb', or 'nt' for corresponding property
1085 # value aliases.  The values of the hash are subhashes.  The subhashes
1086 # have a key of the uchar.h enum symbol, and a value of the alias
1087 # string (as listed in PropertyValueAliases.txt).  NOTE: The alias
1088 # string is whatever alias uchar.h lists.  This may be either short or
1089 # long, depending on the specific enum.  NOTE: For blocks ('blk'), the
1090 # value is a hex code point for the start of the associated block.
1091 # NOTE: The special key _version will map to the Unicode version of
1092 # the file.
1093 sub read_uchar {
1094
1095     my $filename = shift;
1096
1097     my $mode = '';         # state machine mode and submode
1098     my $submode = '';
1099
1100     my $last = '';         # for line folding
1101
1102     my $hash = {};         # result
1103     my $key;               # first-level key
1104
1105     my $in = new FileHandle($filename, 'r');
1106     die "Error: Cannot open $filename" if (!defined $in);
1107
1108     while (<$in>) {
1109         # Fold continued lines together
1110         if (/^(.*)\\$/) {
1111             $last .= $1;
1112             next;
1113         } elsif ($last) {
1114             $_ = $last . $_;
1115             $last = '';
1116         }
1117
1118         # Exit all modes here
1119         if ($mode && $mode ne 'DEPRECATED') {
1120             if (/^\s*\}/) {
1121                 $mode = '';
1122                 next;
1123             }
1124         }
1125
1126         # Handle individual modes
1127
1128         if ($mode eq 'UProperty') {
1129             if (/^\s*(UCHAR_\w+)\s*[,=]/ || /^\s+(UCHAR_\w+)\s*$/) {
1130                 if ($submode) {
1131                     addDatum($hash, $key, $1, $submode);
1132                     $submode = '';
1133                 } else {
1134                     #print "Warning: Ignoring $1\n";
1135                 }
1136             }
1137
1138             elsif (m|^\s*/\*\*\s*(\w+)\s+property\s+(\w+)|i) {
1139                 die "Error: Unmatched tag $submode" if ($submode);
1140                 die "Error: Unrecognized UProperty comment: $_"
1141                     unless (exists $PROP_TYPE{$1});
1142                 $key = $PROP_TYPE{$1};
1143                 $submode = $2;
1144             }
1145         }
1146
1147         elsif ($mode eq 'UCharCategory') {
1148             if (/^\s*(U_\w+)\s*=/) {
1149                 if ($submode) {
1150                     addDatum($hash, 'gc', $1, $submode);
1151                     $submode = '';
1152                 } else {
1153                     #print "Warning: Ignoring $1\n";
1154                 }
1155             }
1156
1157             elsif (m|^\s*/\*\*\s*([A-Z][a-z])\s|) {
1158                 die "Error: Unmatched tag $submode" if ($submode);
1159                 $submode = $1;
1160             }
1161         }
1162
1163         elsif ($mode eq 'UCharDirection') {
1164             if (/^\s*(U_\w+)\s*[,=]/ || /^\s+(U_\w+)\s*$/) {
1165                 if ($submode) {
1166                     addDatum($hash, $key, $1, $submode);
1167                     $submode = '';
1168                 } else {
1169                     #print "Warning: Ignoring $1\n";
1170                 }
1171             }
1172
1173             elsif (m|/\*\*\s*([A-Z]+)\s|) {
1174                 die "Error: Unmatched tag $submode" if ($submode);
1175                 $key = 'bc';
1176                 $submode = $1;
1177             }
1178         }
1179
1180         elsif ($mode eq 'UBlockCode') {
1181             if (m|^\s*(UBLOCK_\w+).+?/\*\[(.+?)\]\*/|) {
1182                 addDatum($hash, 'blk', $1, $2);
1183             }
1184         }
1185
1186         elsif ($mode eq 'UEastAsianWidth') {
1187             if (m|^\s*(U_EA_\w+).+?/\*\[(.+?)\]\*/|) {
1188                 addDatum($hash, 'ea', $1, $2);
1189             }
1190         }
1191
1192         elsif ($mode eq 'UDecompositionType') {
1193             if (m|^\s*(U_DT_\w+).+?/\*\[(.+?)\]\*/|) {
1194                 addDatum($hash, 'dt', $1, $2);
1195             }
1196         }
1197
1198         elsif ($mode eq 'UJoiningType') {
1199             if (m|^\s*(U_JT_\w+).+?/\*\[(.+?)\]\*/|) {
1200                 addDatum($hash, 'jt', $1, $2);
1201             }
1202         }
1203
1204         elsif ($mode eq 'UJoiningGroup') {
1205             if (/^\s*(U_JG_(\w+))/) {
1206                 addDatum($hash, 'jg', $1, $2) unless ($2 eq 'COUNT');
1207             }
1208         }
1209
1210         elsif ($mode eq 'UGraphemeClusterBreak') {
1211             if (m|^\s*(U_GCB_\w+).+?/\*\[(.+?)\]\*/|) {
1212                 addDatum($hash, 'GCB', $1, $2);
1213             }
1214         }
1215
1216         elsif ($mode eq 'UWordBreakValues') {
1217             if (m|^\s*(U_WB_\w+).+?/\*\[(.+?)\]\*/|) {
1218                 addDatum($hash, 'WB', $1, $2);
1219             }
1220         }
1221
1222         elsif ($mode eq 'USentenceBreak') {
1223             if (m|^\s*(U_SB_\w+).+?/\*\[(.+?)\]\*/|) {
1224                 addDatum($hash, 'SB', $1, $2);
1225             }
1226         }
1227
1228         elsif ($mode eq 'ULineBreak') {
1229             if (m|^\s*(U_LB_\w+).+?/\*\[(.+?)\]\*/|) {
1230                 addDatum($hash, 'lb', $1, $2);
1231             }
1232         }
1233
1234         elsif ($mode eq 'UNumericType') {
1235             if (m|^\s*(U_NT_\w+).+?/\*\[(.+?)\]\*/|) {
1236                 addDatum($hash, 'nt', $1, $2);
1237             }
1238         }
1239
1240         elsif ($mode eq 'UHangulSyllableType') {
1241             if (m|^\s*(U_HST_\w+).+?/\*\[(.+?)\]\*/|) {
1242                 addDatum($hash, 'hst', $1, $2);
1243             }
1244         }
1245
1246         elsif ($mode eq 'DEPRECATED') {
1247             if (/\s*\#ifdef/) {
1248                 die "Error: Nested #ifdef";
1249                 }
1250             elsif (/\s*\#endif/) {
1251                 $mode = '';
1252             }
1253         }
1254
1255         elsif (!$mode) {
1256             if (/^\s*\#define\s+(\w+)\s+(.+)/) {
1257                 # #define $left $right
1258                 my ($left, $right) = ($1, $2);
1259
1260                 if ($left eq 'U_UNICODE_VERSION') {
1261                     my $version = $right;
1262                     $version = $1 if ($version =~ /^\"(.*)\"/);
1263                     # print "Unicode version: ", $version, "\n";
1264                     die "Error: Multiple versions in $filename"
1265                         if (defined $hash->{'_version'});
1266                     $hash->{'_version'} = $version;
1267                 }
1268
1269                 elsif ($left =~ /U_GC_(\w+?)_MASK/) {
1270                     addDatum($hash, 'gcm', $left, $1);
1271                 }
1272             }
1273
1274             elsif (/^\s*typedef\s+enum\s+(\w+)\s*\{/ ||
1275                    /^\s*typedef\s+enum\s+(\w+)\s*$/) {
1276                 $mode = $1;
1277                 #print "Parsing $mode\n";
1278             }
1279
1280             elsif (/^\s*enum\s+(\w+)\s*\{/ ||
1281                    /^\s*enum\s+(\w+)\s*$/) {
1282                 $mode = $1;
1283                 #print "Parsing $mode\n";
1284             }
1285
1286             elsif (/^\s*\#ifdef\s+ICU_UCHAR_USE_DEPRECATES\b/) {
1287                 $mode = 'DEPRECATED';
1288             }
1289         }
1290     }
1291
1292     $in->close();
1293
1294     # hardcode known values for the normalization quick check properties
1295     # see unorm.h for the UNormalizationCheckResult enum
1296
1297     addDatum($hash, 'NFC_QC', 'UNORM_NO',    'N');
1298     addDatum($hash, 'NFC_QC', 'UNORM_YES',   'Y');
1299     addDatum($hash, 'NFC_QC', 'UNORM_MAYBE', 'M');
1300
1301     addDatum($hash, 'NFKC_QC', 'UNORM_NO',    'N');
1302     addDatum($hash, 'NFKC_QC', 'UNORM_YES',   'Y');
1303     addDatum($hash, 'NFKC_QC', 'UNORM_MAYBE', 'M');
1304
1305     # no "maybe" values for NF[K]D
1306
1307     addDatum($hash, 'NFD_QC', 'UNORM_NO',    'N');
1308     addDatum($hash, 'NFD_QC', 'UNORM_YES',   'Y');
1309
1310     addDatum($hash, 'NFKD_QC', 'UNORM_NO',    'N');
1311     addDatum($hash, 'NFKD_QC', 'UNORM_YES',   'Y');
1312
1313     $hash;
1314 }
1315
1316 #----------------------------------------------------------------------
1317 # Add a new value to a two-level hash.  That is, given a ref to
1318 # a hash, two keys, and a value, add $hash->{$key1}->{$key2} = $value.
1319 sub addDatum {
1320     my ($h, $k1, $k2, $v) = @_;
1321     if (exists $h->{$k1}->{$k2}) {
1322         die "Error: $k1:$k2 already set to " .
1323             $h->{$k1}->{$k2} . ", cannot set to " . $v;
1324     }
1325     $h->{$k1}->{$k2} = $v;
1326 }
1327
1328 #eof