]> git.saurik.com Git - apple/icu.git/blob - icuSources/tools/genpname/preparse.pl
ICU-400.40.tar.gz
[apple/icu.git] / icuSources / tools / genpname / preparse.pl
1 #!/bin/perl -w
2 #*******************************************************************
3 # COPYRIGHT:
4 # Copyright (c) 2002-2008, International Business Machines Corporation and
5 # others. All Rights Reserved.
6 #*******************************************************************
7
8 # This script reads in UCD files PropertyAliases.txt and
9 # PropertyValueAliases.txt and correlates them with ICU enums
10 # defined in uchar.h and uscript.h. It then outputs a header
11 # file which contains all names and enums. The header is included
12 # by the genpname tool C++ source file, which produces the actual
13 # binary data file.
14 #
15 # See usage note below.
16 #
17 # TODO: The Property[Value]Alias.txt files state that they can support
18 # more than 2 names per property|value. Currently (Unicode 3.2) there
19 # are always 1 or 2 names. If more names were supported, presumably
20 # the format would be something like:
21 # nv ; Numeric_Value
22 # nv ; Value_Numerique
23 # CURRENTLY, this script assumes that there are 1 or two names. Any
24 # duplicates it sees are flagged as an error. If multiple aliases
25 # appear in a future version of Unicode, modify this script to support
26 # that.
27 #
28 # NOTE: As of ICU 2.6, this script has been modified to know about the
29 # pseudo-property gcm/General_Category_Mask, which corresponds to the
30 # uchar.h property UCHAR_GENERAL_CATEGORY_MASK. This property
31 # corresponds to General_Category but is a bitmask value. It does not
32 # exist in the UCD. Therefore, I special case it in several places
33 # (search for General_Category_Mask and gcm).
34 #
35 # NOTE: As of ICU 2.6, this script reads an auxiliary data file,
36 # SyntheticPropertyAliases.txt, containing property aliases not
37 # present in the UCD but present in ICU. This file resides in the
38 # same directory as this script. Its contents are merged into those
39 # of PropertyAliases.txt as if the two files were appended.
40 #
41 # NOTE: The following names are handled specially. See script below
42 # for details.
43 #
44 # T/True
45 # F/False
46 # No_Block
47 #
48 # Author: Alan Liu
49 # Created: October 14 2002
50 # Since: ICU 2.4
51
52 use FileHandle;
53 use strict;
54 use Dumpvalue;
55
# Set to a true value to dump the intermediate hashes to standard output.
my $DEBUG = 1;
my $DUMPER = Dumpvalue->new;

# Command line: <icu_dir> [<out_file>].  The argument count is captured
# before the arguments are shifted off @ARGV so that it can be validated
# later on.
my $count = @ARGV;
my $ICU_DIR = shift() || '';
my $OUT_FILE = shift() || 'data.h';
my $HEADER_DIR = "$ICU_DIR/source/common/unicode";
my $UNIDATA_DIR = "$ICU_DIR/source/data/unidata";

# Get the current year from the system; it is used in the emitted
# copyright notice.
my $YEAR = 1900 + (localtime)[5];

# Used to make "n/a" property [value] aliases (Unicode or Synthetic) unique
my $propNA = 0;
my $valueNA = 0;
71
#----------------------------------------------------------------------
# Keys of the top-level property subhashes: binary, enumerated, string,
# double, and bitmask properties.
my @TOP = ('_bp', '_ep', '_sp', '_dp', '_mp');

# This hash governs how top-level properties are grouped into output
# arrays.  Each key is the prefix of the emitted PROPERTY and
# PROPERTY_COUNT names; each value lists the top-level keys that are
# collected into that array.  Alternative groupings, currently disabled:
#my %TOP_PROPS = ( "VALUED"   => [ '_bp', '_ep' ],
#                  "NO_VALUE" => [ '_sp', '_dp' ] );
#my %TOP_PROPS = ( "BINARY"     => [ '_bp' ],
#                  "ENUMERATED" => [ '_ep' ],
#                  "STRING"     => [ '_sp' ],
#                  "DOUBLE"     => [ '_dp' ] );
my %TOP_PROPS = (
    "" => [ qw(_bp _ep _sp _dp _mp) ],
);

# Maps a property type name to the corresponding top-level key.
my %PROP_TYPE = (
    Binary     => "_bp",
    String     => "_sp",
    Double     => "_dp",
    Enumerated => "_ep",
    Bitmask    => "_mp",
);
#----------------------------------------------------------------------
91
# Properties that are unsupported in ICU.  A property listed here that
# has no enum in uchar.h is reported as "Info" rather than "Warning".
my %UNSUPPORTED = map { ($_ => 1) } qw(
    Composition_Exclusion
    Decomposition_Mapping
    Expands_On_NFC
    Expands_On_NFD
    Expands_On_NFKC
    Expands_On_NFKD
    FC_NFKC_Closure
    ID_Start_Exceptions
    Special_Case_Condition
);

# Short names of properties that weren't seen in uchar.h.  For such
# properties, missing property values are not reported either.
my %MISSING_FROM_UCHAR;

# Additional property aliases beyond the short and long names, like
# "space" in addition to WSpace and White_Space in Unicode 4.1.
# Maps the long name to the alias, e.g. White_Space->space.
#
# Multiple additional aliases are stored in one value string,
# separated with '|', e.g. White_Space->space|outer_space.
my %additional_property_aliases;

#----------------------------------------------------------------------

# Emitted class names
my $STRING_CLASS = 'AliasName';
my $ALIAS_CLASS = 'Alias';
my $PROPERTY_CLASS = 'Property';
123
# Validate the command line: one or two arguments are required, and both
# input directories derived from <icu_dir> must exist.  Otherwise print
# the usage note and exit with status 1.
my $args_ok = ($count >= 1 && $count <= 2);
unless ($args_ok && -d $HEADER_DIR && -d $UNIDATA_DIR) {
    # Script name without its directory part, and a blank string of the
    # same width used to line up the continuation lines.
    (my $me = $0) =~ s{.+[/\\]}{};
    my $lm = ' ' x length($me);
    print <<"END";

$me: Reads ICU4C headers and Unicode data files and creates
$lm a C header file that is included by genpname. The header
$lm file matches constants defined in the ICU4C headers with
$lm property|value aliases in the Unicode data files.

Usage: $me <icu_dir> [<out_file>]

<icu_dir> ICU4C root directory, containing
source/common/unicode/uchar.h
source/common/unicode/uscript.h
source/data/unidata/Blocks.txt
source/data/unidata/PropertyAliases.txt
source/data/unidata/PropertyValueAliases.txt
<out_file> File name of header to be written;
default is 'data.h'.

The Unicode versions of all input files must match.
END
    exit(1);
}
152
# Main driver: read and correlate all inputs, optionally dump the merged
# hash, then write the generated header to $OUT_FILE.
my ($h, $version) = readAndMerge($HEADER_DIR, $UNIDATA_DIR);

if ($DEBUG) {
    print "Merged hash:\n";
    for my $key (sort keys %$h) {
        my $hh = $h->{$key};
        for my $subkey (sort keys %$hh) {
            print "$key:$subkey:", $hh->{$subkey}, "\n";
        }
    }
}

my $out = FileHandle->new($OUT_FILE, 'w');
die "Error: Can't write to $OUT_FILE: $!" unless (defined $out);

# formatData() prints to the currently selected handle, so select the
# output file for the duration of the call.
my $save = select($out);
formatData($h, $version);
select($save);

# Buffered write errors only surface when the handle is closed.
$out->close()
    or die "Error: Can't finish writing $OUT_FILE: $!";

exit(0);
173
#----------------------------------------------------------------------
# Return true if the given property name is one that should be ignored.
#
# From PropList.html: "The properties of the form Other_XXX are used to
# generate properties in DerivedCoreProperties.txt.  They are not
# intended for general use, such as in APIs that return property values."
# Non_Break is not a valid property as of 3.2.
#
# @param a property name
#
# @return true if the name starts with "Other_" or is exactly
#         "Non_Break"; both tests ignore case.
sub isIgnoredProperty {
    my ($name) = @_;
    return 1 if ($name =~ /^Other_/i);
    return $name =~ /^Non_Break$/i;
}
184
# Return true if the given short property name is a pseudo-property.
#
# 'qc' is a pseudo-property matching any quick-check property; see the
# PropertyValueAliases.txt file comments.  'binprop' is a synthetic
# binary value alias "True"/"False", not present in
# PropertyValueAliases.txt until Unicode 5.0.  Starting with Unicode 5.1,
# PropertyValueAliases.txt does have explicit values for binary
# properties.
#
# @param a short property name (compared case-sensitively)
sub isPseudoProperty {
    my ($short_name) = @_;
    return 1 if ($short_name eq 'qc');
    return $short_name eq 'binprop';
}
195
#----------------------------------------------------------------------
# Emit the combined data from headers and the Unicode database as a
# C source code header file.  All output is printed to the currently
# selected filehandle.
#
# NOTE: The "<name>|<name>..." strings stored in the hash are replaced,
# in place, by integer indices into the emitted NAME_GROUP table.
#
# @param ref to hash with the data
# @param Unicode version, as a string
sub formatData {
    my ($h, $version) = @_;

    my $date = scalar localtime();
    print <<"END";
/**
* Copyright (C) 2002-$YEAR, International Business Machines Corporation and
* others. All Rights Reserved.
*
* MACHINE GENERATED FILE. !!! Do not edit manually !!!
*
* Generated from
* uchar.h
* uscript.h
* Blocks.txt
* PropertyAliases.txt
* PropertyValueAliases.txt
*
* Date: $date
* Unicode version: $version
* Script: $0
*/

END

    #------------------------------------------------------------
    # Emit Unicode version, padded with zeros to four fields
    print "/* Unicode version $version */\n";
    my @v = split(/\./, $version);
    push @v, '0' while (@v < 4);
    for my $i (0 .. $#v) {
        print "const uint8_t VERSION_$i = $v[$i];\n";
    }
    print "\n";

    #------------------------------------------------------------
    # Emit String table
    # [A table of all identifiers, that is, all long or short property
    # or value names.  The list need NOT be sorted; it will be sorted
    # by the C program.  Strings are referenced by their index into
    # this table.  After sorting, a REMAP[] array is used to map the
    # old position indices to the new positions.]
    my %seen;
    for my $prop (sort keys %$h) {
        my $hh = $h->{$prop};
        for my $enum (sort keys %$hh) {
            for my $name (split(/\|/, $hh->{$enum})) {
                $seen{$name} = 1 if (length($name));
            }
        }
    }
    # Index 0 is, by definition, the empty string.
    my @strings = ('', sort keys %seen);

    print "const int32_t STRING_COUNT = ", scalar @strings, ";\n\n";

    # while printing, create a mapping hash from string table entry to index
    my %stringToID;
    print "/* to be sorted */\n";
    print "const $STRING_CLASS STRING_TABLE[] = {\n";
    for my $i (0 .. $#strings) {
        print " $STRING_CLASS(\"$strings[$i]\", $i),\n";
        $stringToID{$strings[$i]} = $i;
    }
    print "};\n\n";

    # placeholder for the remapping index.  this is used to map
    # indices that we compute here to indices of the sorted
    # STRING_TABLE.  STRING_TABLE will be sorted by the C++ program
    # using the uprv_comparePropertyNames() function.  this will
    # reshuffle the order.  we then use the indices (passed to the
    # String constructor) to create a REMAP[] array.
    print "/* to be filled in */\n";
    print "int32_t REMAP[", scalar @strings, "];\n\n";

    #------------------------------------------------------------
    # Emit the name group table
    # [A table of name groups.  A name group is one or more names
    # for a property or property value.  The name group table looks
    # like this:
    #
    #  114, -115, 116, -117, 0, -118, 65, -64, ...
    #  [0]        [2]        [4]      [6]
    #
    # The entry at [0] consists of 2 strings, 114 and 115.
    # The entry at [2] consists of 116 and 117.  The entry at
    # [4] has an empty first name (0 is by definition the index of "")
    # followed by 118.  The last entry of every group is negated to
    # mark the end of the group.]

    # Build name group list and replace string refs with nameGroup indices
    my @nameGroups;

    # Check for duplicate name groups, and reuse them if possible
    my %groupToInt; # Map group strings to ints
    for my $prop (sort keys %$h) {
        my $hh = $h->{$prop};
        for my $enum (sort keys %$hh) {
            my $groupString = $hh->{$enum};
            my $index;
            if (exists $groupToInt{$groupString}) {
                $index = $groupToInt{$groupString};
            } else {
                my @names = split(/\|/, $groupString);
                die "Error: Wrong number of names in " . $groupString if (@names < 1);
                $index = @nameGroups;               # index of group we are making
                $groupToInt{$groupString} = $index; # Cache for reuse
                push @nameGroups, map { $stringToID{$_} } @names;
                $nameGroups[$#nameGroups] = -$nameGroups[$#nameGroups]; # mark end
            }
            # now, replace string list with ref to name group
            $hh->{$enum} = $index;
        }
    }

    print "const int32_t NAME_GROUP_COUNT = ",
          scalar @nameGroups, ";\n\n";

    print "int32_t NAME_GROUP[] = {\n";
    # emit one group per line, with annotations
    my $max_names = 0;
    my $pos = 0;
    while ($pos < @nameGroups) {
        my $start = $pos;
        my @ids;
        my $line = '';
        for (;;) {
            my $j = $nameGroups[$pos++];
            $line .= "$j, ";
            push @ids, abs($j);
            last if ($j < 0);
        }
        # Pad the numbers to 20 columns.  Long groups exceed that width;
        # clamp at zero because a negative repeat count triggers a
        # warning on newer perls.
        my $pad = 20 - length($line);
        $pad = 0 if ($pad < 0);
        print " ",
              $line,
              ' ' x $pad,
              "/* ", sprintf("%3d", $start),
              ": \"", join("\", \"", map { $strings[$_] } @ids), "\" */\n";
        $max_names = @ids if (@ids > $max_names);
    }
    print "};\n\n";

    # Computed from the data: the largest number of names in any group.
    print "#define MAX_NAMES_PER_GROUP $max_names\n\n";

    #------------------------------------------------------------
    # Emit enumerated property values
    for my $prop (sort keys %$h) {
        next if ($prop =~ /^_/);
        my $vh = $h->{$prop};
        my $count = scalar keys %$vh;

        print "const int32_t VALUES_${prop}_COUNT = ",
              $count, ";\n\n";

        print "const $ALIAS_CLASS VALUES_${prop}\[] = {\n";
        for my $enum (sort keys %$vh) {
            print " $ALIAS_CLASS((int32_t) $enum, ", $vh->{$enum}, "),\n";
        }
        print "};\n\n";
    }

    #------------------------------------------------------------
    # Emit top-level properties (binary, enumerated, etc.)
    for my $topName (sort keys %TOP_PROPS) {
        my $types = $TOP_PROPS{$topName};
        my $count = 0;
        for my $type (@$types) { # "_bp", "_ep", etc.
            $count += scalar keys %{$h->{$type}};
        }

        print "const int32_t ${topName}PROPERTY_COUNT = $count;\n\n";

        print "const $PROPERTY_CLASS ${topName}PROPERTY[] = {\n";

        for my $type (@$types) { # "_bp", "_ep", etc.
            my $p = $h->{$type};

            for my $enum (sort keys %$p) {
                # First name of the property's name group.  abs() guards
                # against a single-name group, whose only entry carries
                # the negative end-of-group mark.
                my $name = $strings[abs($nameGroups[$p->{$enum}])];

                my $valueRef = "0, NULL";
                if ($type eq '_bp') {
                    $valueRef = "VALUES_binprop_COUNT, VALUES_binprop";
                }
                elsif (exists $h->{$name}) {
                    $valueRef = "VALUES_${name}_COUNT, VALUES_$name";
                }

                print " $PROPERTY_CLASS((int32_t) $enum, ",
                      $p->{$enum}, ", $valueRef),\n";
            }
        }
        print "};\n\n";
    }

    print "/*eof*/\n";
}
414
#----------------------------------------------------------------------
# Read in the files uchar.h, uscript.h, Blocks.txt,
# PropertyAliases.txt, and PropertyValueAliases.txt (plus the synthetic
# alias files, which are opened by bare file name), and combine them
# into one hash.
#
# @param directory containing headers
# @param directory containing Unicode data files
#
# @return hash ref, Unicode version
sub readAndMerge {
    my $headerDir = shift;
    my $unidataDir = shift;

    my $h = read_uchar("$headerDir/uchar.h");
    my $s = read_uscript("$headerDir/uscript.h");
    my $blocks = read_Blocks("$unidataDir/Blocks.txt");

    # The synthetic files are merged into the same hashes as the UCD
    # files, as if the two files were appended.
    my $pa = {};
    for my $file ("$unidataDir/PropertyAliases.txt",
                  "SyntheticPropertyAliases.txt") {
        read_PropertyAliases($pa, $file);
    }
    my $va = {};
    for my $file ("$unidataDir/PropertyValueAliases.txt",
                  "SyntheticPropertyValueAliases.txt") {
        read_PropertyValueAliases($va, $file);
    }

    # Extract property family hash
    my $fam = delete $pa->{'_family'};

    # Note: uscript.h has no version string, so don't check it
    my $version = check_versions(
        [ 'uchar.h', $h ],
        [ 'Blocks.txt', $blocks ],
        [ 'PropertyAliases.txt', $pa ],
        [ 'PropertyValueAliases.txt', $va ],
    );

    # Do this BEFORE merging; merging modifies the hashes
    check_PropertyValueAliases($pa, $va);

    # Dump out the $va and $s hashes for debugging
    if ($DEBUG) {
        print "Property values hash:\n";
        for my $prop (sort keys %$va) {
            my $values = $va->{$prop};
            print "$prop:$_:", $values->{$_}, "\n" for (sort keys %$values);
        }

        print "Script hash:\n";
        print "$_:", $s->{$_}, "\n" for (sort keys %$s);
    }

    # Link in the script data
    $h->{'sc'} = $s;

    merge_Blocks($h, $blocks);
    merge_PropertyAliases($h, $pa, $fam);
    merge_PropertyValueAliases($h, $va);

    return ($h, $version);
}
481
#----------------------------------------------------------------------
# Ensure that the version strings in the given hashes (under the key
# '_version') are compatible.  Currently this means they must be
# identical after each one is padded with ".0" to the form X.Y.Z, so
# that "X.Y" will match "X.Y.0".  All hashes must define the key
# '_version'; dies (naming the offending file) if one does not, or if
# two versions differ.
#
# @param a list of pairs of (file name, hash reference)
#
# @return the version of all the hashes.  Upon return, the '_version'
# will be removed from all hashes.
sub check_versions {
    my $version = '';
    my $msg = '';    # "<file> = <version>" lines seen so far, for the error report
    for my $pair (@_) {
        my ($name, $h) = @$pair;
        die "Error: No version found in $name" unless (exists $h->{'_version'});
        my $v = delete $h->{'_version'};

        # append ".0" if necessary (at most twice), to standardize to X.Y.Z
        for (1 .. 2) {
            $v .= '.0' unless ($v =~ /\.\d+\./);
        }
        $msg .= "$name = $v\n";
        if ($version) {
            die "Error: Mismatched Unicode versions\n$msg"
                unless ($version eq $v);
        } else {
            $version = $v;
        }
    }
    return $version;
}
515
#----------------------------------------------------------------------
# Make sure the property names in PropertyValueAliases.txt match those
# in PropertyAliases.txt.  A mismatch is reported as a warning on
# standard output; it is not fatal.  Pseudo-properties are exempt.
#
# @param a hash ref from read_PropertyAliases (long name -> short name).
# @param a hash ref from read_PropertyValueAliases (keyed by short
#        property name).
sub check_PropertyValueAliases {
    my ($pa, $va) = @_;

    # Invert the long->short mapping so that short names can be looked up
    my %long_of = map { ($pa->{$_} => $_) } keys %$pa;

    for my $prop (keys %$va) {
        next if (exists $long_of{$prop});
        next if (isPseudoProperty($prop));
        print "Warning: Property $prop from PropertyValueAliases not listed in PropertyAliases\n";
    }
}
535
#----------------------------------------------------------------------
# Merge blocks data into uchar.h enum data.  In the 'blk' subhash all
# code point values, as returned from read_uchar, are replaced by
# block names, as read from Blocks.txt and returned by read_Blocks.
# The match must be 1-to-1.  If there is any failure of 1-to-1
# mapping, the script dies.  Every matched entry is removed from the
# read_Blocks hash, so on a successful return that hash is empty.
#
# The mapping in the 'blk' subhash, after this function returns, is
# from uchar.h enum name, e.g. "UBLOCK_BASIC_LATIN", to Blocks.txt
# pseudo-name, e.g. "Basic Latin".
#
# @param a hash ref from read_uchar.
# @param a hash ref from read_Blocks.
sub merge_Blocks {
    my ($h, $blocks) = @_;

    exists $h->{'blk'}
        or die "Error: No blocks data in uchar.h";
    my $blk = $h->{'blk'};

    for my $enum (keys %$blk) {
        my $cp = $blk->{$enum};
        die "Error: No block found at $cp in Blocks.txt"
            if ($cp && !exists $blocks->{$cp});

        # Convert code point to pseudo-name, and mark the block as used
        $blk->{$enum} = $blocks->{$cp};
        delete $blocks->{$cp};
    }

    # Anything still left in the blocks hash has no enum in uchar.h
    my @unmatched = map { "Error: Block " . $blocks->{$_} . " not listed in uchar.h\n" }
                    keys %$blocks;
    die join('', @unmatched) if (@unmatched);
}
571
#----------------------------------------------------------------------
# Merge property alias names into the uchar.h hash.  The subhashes
# under the top-level keys listed in @TOP are examined and the values of
# those subhashes are assumed to be long names in PropertyAliases.txt.
# They are validated and replaced by "<short>|<long>", followed by
# "|<additional aliases>" if any were recorded for the long name.  An
# "n/a" short name becomes an empty string.  Every matched name is
# removed from the read_PropertyAliases hash, so upon return it holds
# only the names that failed to match.  Unmatched names in
# PropertyAliases are listed as a warning (or as info, for unsupported
# properties) but do NOT cause the script to die.
#
# @param a hash ref from read_uchar.
# @param a hash ref from read_PropertyAliases.
# @param a hash mapping long names to property family (e.g., 'binary')
sub merge_PropertyAliases {
    my ($h, $pa, $fam) = @_;

    for my $top (@TOP) {
        exists $h->{$top}
            or die "Error: No properties data for $top in uchar.h";
    }

    for my $top (@TOP) {
        my $props = $h->{$top};
        for my $enum (keys %$props) {
            my $long_name = $props->{$enum};
            die "Error: Property $long_name not found (or used more than once)"
                unless (exists $pa->{$long_name});

            # Taking the entry out of $pa marks it as used.
            my $short_name = delete $pa->{$long_name};

            # Replace an "n/a" short name with an empty name, but keep its
            # slot (the leading "|"): there must always be a long name,
            # and without the empty slot the long name would be taken for
            # the short name.
            $short_name = '' if ($short_name =~ m|^n/a\d*$|);

            my @names = ($short_name, $long_name);
            push @names, $additional_property_aliases{$long_name}
                if (exists $additional_property_aliases{$long_name});
            $props->{$enum} = join('|', @names);
        }
    }

    # Report the properties that have no enum in uchar.h
    my @notes;
    for my $name (keys %$pa) {
        $MISSING_FROM_UCHAR{$pa->{$name}} = 1;

        my $severity = exists $UNSUPPORTED{$name} ? 'Info'
                     : isIgnoredProperty($name)   ? undef
                     :                              'Warning';
        next unless (defined $severity);
        push @notes, "$severity: No enum for " . $fam->{$name} . " property $name in uchar.h";
    }
    print join("\n", sort @notes), "\n" if (@notes);
}
629
#----------------------------------------------------------------------
# Return true if two names match ignoring case, whitespace, '-', and '_'.
# Used to match names in Blocks.txt with those in PropertyValueAliases.txt
# as of Unicode 4.0.
#
# The names are compared as plain strings; regular expression
# metacharacters in either name have no special meaning.
#
# @param first name
# @param second name
#
# @return true if the names match loosely
sub matchesLoosely {
    my ($left, $right) = @_;
    for my $name ($left, $right) {
        $name =~ s/[\s\-_]//g;
    }
    return lc($left) eq lc($right);
}
640
#----------------------------------------------------------------------
# Merge PropertyValueAliases.txt data into the uchar.h hash.  All
# subhashes whose key does not start with '_' are analyzed and their
# values are mapped to the names listed in PropertyValueAliases.  They
# are then replaced with a string of the form "<short>|<long>".  The
# short or long name may be missing (an "n/a" name becomes empty).
# The 'ccc' values (also copied to 'lccc' and 'tccc') and the 'binprop'
# values are merged in directly from the PropertyValueAliases hash.
#
# Names are compared as plain strings, ignoring case.  Values left over
# in the PropertyValueAliases hash are reported as warnings on standard
# output, unless the property itself was missing from uchar.h.
#
# @param a hash ref from read_uchar.
# @param a hash ref from read_PropertyValueAliases.
sub merge_PropertyValueAliases {
    my ($h, $va) = @_;

    my %gcCount;    # number of times each 'gc' value name was used
    for my $prop (keys %$h) {
        # _bp, _ep etc. are handled in merge_PropertyAliases
        next if ($prop =~ /^_/);

        # Special case: gcm shares the value names of gc
        my $prop2 = ($prop eq 'gcm') ? 'gc' : $prop;

        # find corresponding PropertyValueAliases data
        die "Error: Can't find $prop in PropertyValueAliases.txt"
            unless (exists $va->{$prop2});
        my $pva = $va->{$prop2};

        # match up data
        my $hh = $h->{$prop};
        for my $enum (keys %$hh) {

            my $name = $hh->{$enum};
            my $lc_name = lc($name);

            # look up both long and short & ignore case
            my $n;
            if (exists $pva->{$name}) {
                $n = $name;
            } else {
                # iterate (slow)
                for my $short (keys %$pva) {
                    # case-insensitive match against the key
                    # & case-insensitive match against its value
                    if (lc($short) eq $lc_name ||
                        lc($pva->{$short}) eq $lc_name) {
                        $n = $short;
                        last;
                    }
                }
            }

            # For blocks, do a loose match from Blocks.txt pseudo-name
            # to PropertyValueAliases long name.
            if (!defined $n && $prop eq 'blk') {
                for my $short (keys %$pva) {
                    # The block is only going to match the long name,
                    # but we check both for completeness.  As of Unicode
                    # 4.0, blocks do not have short names.
                    if (matchesLoosely($name, $pva->{$short}) ||
                        matchesLoosely($name, $short)) {
                        $n = $short;
                        last;
                    }
                }
            }

            die "Error: Property value $prop:$name not found" unless (defined $n);

            my $short_name = $n;
            my $long_names = $pva->{$n};
            # convert |n/a\d*| to blank
            $short_name = '' if ($short_name =~ m|^n/a\d*$|);
            $long_names = '' if ($long_names =~ m|^n/a\d*$|);

            $hh->{$enum} = "$short_name|$long_names";
            # Don't delete the 'gc' properties because we need to share
            # them between 'gc' and 'gcm'.  Count each use instead.
            if ($prop2 eq 'gc') {
                ++$gcCount{$n};
            } else {
                delete $pva->{$n};
            }
        }
    }

    # Merge the combining class values in manually
    # Add the same values to the synthetic lccc and tccc properties
    die "Error: No ccc data"
        unless exists $va->{'ccc'};
    for my $ccc (keys %{$va->{'ccc'}}) {
        die "Error: Can't overwrite ccc $ccc"
            if (exists $h->{'ccc'}->{$ccc});
        $h->{'lccc'}->{$ccc} =
        $h->{'tccc'}->{$ccc} =
        $h->{'ccc'}->{$ccc} = $va->{'ccc'}->{$ccc};
    }
    delete $va->{'ccc'};

    # Merge synthetic binary property values in manually.
    # These are the "True" and "False" value aliases.
    die "Error: No True/False value aliases"
        unless exists $va->{'binprop'};
    for my $bp (keys %{$va->{'binprop'}}) {
        $h->{'binprop'}->{$bp} = $va->{'binprop'}->{$bp};
    }
    delete $va->{'binprop'};

    # Whatever is left in $va was not matched by any enum
    my $err = '';
    for my $prop (sort keys %$va) {
        my $hh = $va->{$prop};
        for my $subkey (sort keys %$hh) {
            # 'gc' props are shared with 'gcm'; make sure they were used
            # once or twice.  A value that was never used has no count.
            if ($prop eq 'gc') {
                my $uses = $gcCount{$subkey} || 0;
                next if ($uses >= 1 && $uses <= 2);
            }
            $err .= "Warning: Enum for value $prop:$subkey not found in uchar.h\n"
                unless exists $MISSING_FROM_UCHAR{$prop};
        }
    }
    print $err if ($err);
}
761
#----------------------------------------------------------------------
# Read a file in PropertyAliases.txt format and add its contents to the
# given hash, which maps the long name to the short name.  The special
# key '_version' will map to the Unicode version of the file.  The
# special key '_family' holds a subhash that maps long names to a family
# string, for descriptive purposes.  The function may be called several
# times with the same hash in order to merge several files.
#
# Aliases beyond the short and long name are recorded in
# %additional_property_aliases under the long name.
#
# @param reference to hash to receive data.  Keys are long names.
#        Values are short names.
# @param a filename for PropertyAliases.txt
sub read_PropertyAliases {

    my ($hash, $filename) = @_;

    # map long names to family string; continue a map from an earlier call
    my $fam = exists $hash->{'_family'} ? $hash->{'_family'} : {};

    my $family; # binary, enumerated, etc.

    my $in = FileHandle->new($filename, 'r');
    die "Error: Cannot open $filename: $!" if (!defined $in);

    # Don't clobber the caller's $_ with the lines read below
    local $_;
    while (<$in>) {

        # Read version (embedded in a comment)
        if (/PropertyAliases-(\d+\.\d+\.\d+)/i) {
            die "Error: Multiple versions in $filename"
                if (exists $hash->{'_version'});
            $hash->{'_version'} = $1;
        }

        # Read family heading (a comment line ending in "Properties")
        if (/^\s*\#\s*(.+?)\s*Properties\s*$/) {
            $family = $1;
        }

        # Ignore comments and blank lines
        s/\#.*//;
        next unless (/\S/);

        if (/^\s*(.+?)\s*;/) {
            my $short = $1;
            my @fields = /;\s*([^\s;]+)/g;
            if (@fields < 1 || @fields > 2) {
                my $number = @fields;
                die "Error: Wrong number of fields ($number) in $filename at $_";
            }

            # Make "n/a" strings unique
            if ($short eq 'n/a') {
                $short .= sprintf("%03d", $propNA++);
            }
            my $long = $fields[0];
            if ($long eq 'n/a') {
                $long .= sprintf("%03d", $propNA++);
            }

            # Add long name->short name to the hash=pa hash table
            if (exists $hash->{$long}) {
                die "Error: Duplicate property $long in $filename"
            }
            $hash->{$long} = $short;
            $fam->{$long} = $family;

            # Add the list of further aliases to the
            # additional_property_aliases hash table, using the long
            # property name as the key.  For example:
            #   White_Space->space|outer_space
            # (The field count check above currently allows at most one
            # further alias.)
            if (@fields > 1) {
                $additional_property_aliases{$long} =
                    join('|', @fields[1 .. $#fields]);
            }
        } else {
            die "Error: Can't parse $_ in $filename";
        }
    }

    $in->close();

    $hash->{'_family'} = $fam;
}
848
#----------------------------------------------------------------------
# Read a file in PropertyValueAliases.txt format and add its contents
# to the given two level hash, which maps
# property_short_name:value_short_name:value_long_name.  When a line
# carries more than two value names, the extra names are appended to
# the long name, separated by '|'.  The special key '_version' will map
# to the Unicode version of the file.  The function may be called
# several times with the same hash in order to merge several files.
#
# After reading, the script value sc:Qaac and the 'binprop' values for
# binary properties are synthesized if the data did not supply them.
#
# @param reference to hash to receive data.
# @param a filename for PropertyValueAliases.txt
sub read_PropertyValueAliases {

    my ($hash, $filename) = @_;

    my $in = FileHandle->new($filename, 'r');
    die "Error: Cannot open $filename: $!" if (!defined $in);

    # Don't clobber the caller's $_ with the lines read below
    local $_;
    while (<$in>) {

        # Read version (embedded in a comment)
        if (/PropertyValueAliases-(\d+\.\d+\.\d+)/i) {
            die "Error: Multiple versions in $filename"
                if (exists $hash->{'_version'});
            $hash->{'_version'} = $1;
        }

        # Ignore comments and blank lines
        s/\#.*//;
        next unless (/\S/);

        if (/^\s*(.+?)\s*;/i) {
            my $prop = $1;
            my @fields = /;\s*([^\s;]+)/g;
            die "Error: Wrong number of fields in $filename"
                if (@fields < 2 || @fields > 5);
            # Make "n/a" strings unique
            $fields[0] .= sprintf("%03d", $valueNA++) if ($fields[0] eq 'n/a');
            # Squash extra fields together
            while (@fields > 2) {
                my $f = pop @fields;
                $fields[$#fields] .= '|' . $f;
            }
            addDatum($hash, $prop, @fields);
        }

        else {
            die "Error: Can't parse $_ in $filename";
        }
    }

    $in->close();

    # Script Copt=Qaac (Coptic) is a special case.
    # Before the Copt code was defined, the private-use code Qaac was used.
    # Starting with Unicode 4.1, PropertyValueAliases.txt contains
    # Copt as the short name as well as Qaac as an alias.
    # For use with older Unicode data files, we add here a Qaac->Coptic entry.
    # This should not do anything for 4.1-and-later Unicode data files.
    # See also UAX #24: Script Names http://www.unicode.org/unicode/reports/tr24/
    $hash->{'sc'}->{'Qaac'} = 'Coptic'
        unless (exists $hash->{'sc'}->{'Qaac'} || exists $hash->{'sc'}->{'Copt'});

    # Add N|No|F|False and Y|Yes|T|True -- these are values we recognize for
    # binary properties (until Unicode 5.0 NOT from PropertyValueAliases.txt).
    # These are of the same form as the 'ccc' value aliases.
    # Starting with Unicode 5.1, PropertyValueAliases.txt does have values
    # for binary properties.
    if (!exists $hash->{'binprop'}->{'0'}) {
        # Test for 'Alpha' itself first: looking at $hash->{'Alpha'}->{'N'}
        # directly would create an empty 'Alpha' entry as a side effect,
        # and the pre-5.1 branch below could then never be reached.
        my $alpha = exists $hash->{'Alpha'} ? $hash->{'Alpha'} : undef;
        if (defined $alpha && exists $alpha->{'N'}) {
            # Unicode 5.1 and later: Make the numeric value the key.
            $hash->{'binprop'}->{'0'} = 'N|' . $alpha->{'N'};
            $hash->{'binprop'}->{'1'} = 'Y|' . $alpha->{'Y'};
        } elsif (defined $alpha) {
            die "Error: Unrecognized short value name for binary property 'Alpha'\n";
        } else {
            # Unicode 5.0 and earlier: Add manually.
            $hash->{'binprop'}->{'0'} = 'N|No|F|False';
            $hash->{'binprop'}->{'1'} = 'Y|Yes|T|True';
        }
    }
}
932
#----------------------------------------------------------------------
# Read the Blocks.txt file.  Return a hash that maps the code point
# range start to the block name.  The special key '_version' will map
# to the Unicode version of the file.
#
# As of Unicode 4.0, the names in the Blocks.txt are no longer the
# proper names.  The proper names are now listed in PropertyValueAliases.
# They are similar but not identical.  Furthermore, 4.0 introduces
# a new block name, No_Block, which is listed only in PropertyValueAliases
# and not in Blocks.txt.  As a result, we handle blocks as follows:
#
# 1. Read Blocks.txt to map code point range start to quasi-block name.
# 2. Add to the result a synthetic No Block entry:
#      none -> No Block
# 3. Map quasi-names from Blocks.txt (including No Block) to actual
#    names from PropertyValueAliases.  This occurs in
#    merge_PropertyValueAliases.
#
# Dies if the file cannot be opened, contains a line that is not of the
# form "<start>..<end>; <name>", or lists a range start or a version
# more than once.
#
# @param a filename for Blocks.txt
#
# @return a ref to a hash.  Keys are code points, as text, e.g.,
# "1720".  Values are pseudo-block names, e.g., "Hanunoo".
sub read_Blocks {

    my ($filename) = @_;

    my $hash = {}; # result

    my $in = FileHandle->new($filename, 'r');
    die "Error: Cannot open $filename: $!" if (!defined $in);

    # Don't clobber the caller's $_ with the lines read below
    local $_;
    while (<$in>) {

        # Read version (embedded in a comment)
        if (/Blocks-(\d+\.\d+\.\d+)/i) {
            die "Error: Multiple versions in $filename"
                if (exists $hash->{'_version'});
            $hash->{'_version'} = $1;
        }

        # Ignore comments and blank lines
        s/\#.*//;
        next unless (/\S/);

        if (/^([0-9a-f]+)\.\.[0-9a-f]+\s*;\s*(.+?)\s*$/i) {
            my ($start, $block_name) = ($1, $2);
            die "Error: Duplicate range $start in $filename"
                if (exists $hash->{$start});
            $hash->{$start} = $block_name;
        }

        else {
            die "Error: Can't parse $_ in $filename";
        }
    }

    $in->close();

    # Add pseudo-name for No Block
    $hash->{'none'} = 'No Block';

    return $hash;
}
995
#----------------------------------------------------------------------
# Read the uscript.h file and compile a mapping of ICU script enum
# symbols to the script codes noted beside them.
#
# Only enumerators inside "typedef enum UScriptCode" are recorded.  An
# enumerator is recorded when its (folded) line looks like
#     USCRIPT_xxx ... /* Code
# i.e. the value stored is the first word of the comment that follows
# the enumerator on the same logical line.  Enumerators without such a
# comment are ignored.  Lines between "#ifdef ICU_UCHAR_USE_DEPRECATES"
# and "#endif" (outside of any enum) are skipped.
#
# @param a filename for uscript.h
#
# @return a ref to a hash.  The keys of the hash are enum symbols from
# uscript.h (USCRIPT_*), and the values are the script codes/names
# taken from the trailing comments.
#
# Dies if the file cannot be opened, if an enum symbol is seen twice,
# or if an #ifdef is nested inside the deprecated region.
sub read_uscript {

    my $filename = shift;

    my $mode = '';   # state machine mode: '', an enum name, or 'DEPRECATED'
    my $last = '';   # accumulated text of backslash-continued lines
    my $hash = {};   # result

    my $in = FileHandle->new($filename, 'r');
    die "Error: Cannot open $filename" if (!defined $in);

    while (<$in>) {
        # Fold continued lines together.  The pieces are appended so
        # that a logical line spanning three or more physical lines
        # keeps all of its text (this matches read_uchar).
        if (/^(.*)\\$/) {
            $last .= $1;
            next;
        } elsif (length $last) {
            $_ = $last . $_;
            $last = '';
        }

        # A closing brace ends whatever enum we are in.
        if ($mode && $mode ne 'DEPRECATED') {
            if (/^\s*\}/) {
                $mode = '';
                next;
            }
        }

        # Handle individual modes

        if ($mode eq 'UScriptCode') {
            if (m|^\s*(USCRIPT_\w+).+?/\*\s*(\w+)|) {
                my ($enum, $code) = ($1, $2);
                die "Error: Duplicate script $enum"
                    if (exists $hash->{$enum});
                $hash->{$enum} = $code;
            }
        }

        elsif ($mode eq 'DEPRECATED') {
            if (/\s*\#ifdef/) {
                die "Error: Nested #ifdef";
            }
            elsif (/\s*\#endif/) {
                $mode = '';
            }
        }

        elsif (!$mode) {
            # The enum name becomes the mode; enums other than
            # UScriptCode are entered but contribute nothing.
            if (/^\s*typedef\s+enum\s+(\w+)\s*\{/ ||
                /^\s*typedef\s+enum\s+(\w+)\s*$/) {
                $mode = $1;
            }

            elsif (/^\s*\#ifdef\s+ICU_UCHAR_USE_DEPRECATES\b/) {
                $mode = 'DEPRECATED';
            }
        }
    }

    $in->close();

    $hash;
}
1074
#----------------------------------------------------------------------
# Read the uchar.h file and compile a mapping of ICU enum symbols to
# the Unicode aliases noted beside them.
#
# @param a filename for uchar.h
#
# @return a ref to a two-level hash.  First-level keys are:
#   - for the UProperty enum, whatever %PROP_TYPE (defined elsewhere
#     in this file) maps the property kind word to; per the original
#     notes these are '_bp' for binary properties, '_ep' for
#     enumerated properties, and '_dp'/'_sp'/'_mp' for
#     double/string/mask properties;
#   - 'gc', 'gcm', 'bc', 'blk', 'ea', 'dt', 'jt', 'jg', 'GCB', 'WB',
#     'SB', 'lb', 'nt', 'hst' for the corresponding property value
#     aliases;
#   - 'NFC_QC', 'NFKC_QC', 'NFD_QC', 'NFKD_QC', which are hardcoded.
# Each value is a subhash whose keys are enum symbols and whose values
# are the alias strings as uchar.h lists them (short or long,
# depending on the specific enum).  NOTE: For blocks ('blk'), the
# value is the bracketed text from the header, which per the original
# notes is the hex code point of the start of the block.
# NOTE: The special key _version maps to the Unicode version taken
# from the U_UNICODE_VERSION #define.
#
# Dies on an unopenable file, a second U_UNICODE_VERSION, a tag
# comment that arrives while another is still pending, an unknown
# UProperty kind, a nested #ifdef in the deprecated region, or (via
# addDatum) a repeated enum symbol.
sub read_uchar {
    my $filename = shift;

    # Enums whose enumerators carry the alias in a /*[alias]*/ comment
    # on the same line: enum name => [ line pattern, result key ].
    my %bracket_enum = (
        'UBlockCode'            => [ qr|^\s*(UBLOCK_\w+).+?/\*\[(.+?)\]\*/|, 'blk' ],
        'UEastAsianWidth'       => [ qr|^\s*(U_EA_\w+).+?/\*\[(.+?)\]\*/|, 'ea' ],
        'UDecompositionType'    => [ qr|^\s*(U_DT_\w+).+?/\*\[(.+?)\]\*/|, 'dt' ],
        'UJoiningType'          => [ qr|^\s*(U_JT_\w+).+?/\*\[(.+?)\]\*/|, 'jt' ],
        'UGraphemeClusterBreak' => [ qr|^\s*(U_GCB_\w+).+?/\*\[(.+?)\]\*/|, 'GCB' ],
        'UWordBreakValues'      => [ qr|^\s*(U_WB_\w+).+?/\*\[(.+?)\]\*/|, 'WB' ],
        'USentenceBreak'        => [ qr|^\s*(U_SB_\w+).+?/\*\[(.+?)\]\*/|, 'SB' ],
        'ULineBreak'            => [ qr|^\s*(U_LB_\w+).+?/\*\[(.+?)\]\*/|, 'lb' ],
        'UNumericType'          => [ qr|^\s*(U_NT_\w+).+?/\*\[(.+?)\]\*/|, 'nt' ],
        'UHangulSyllableType'   => [ qr|^\s*(U_HST_\w+).+?/\*\[(.+?)\]\*/|, 'hst' ],
    );

    my $mode    = '';   # '', the name of the enum being read, or 'DEPRECATED'
    my $submode = '';   # alias from the most recent tag comment, if unused
    my $folded  = '';   # accumulated text of backslash-continued lines
    my $result  = {};
    my $key;            # first-level key that goes with $submode

    # Record an enumerator under the pending tag, then clear the tag.
    # Enumerators that arrive with no pending tag are ignored.
    my $store_tagged = sub {
        my ($k1, $enum) = @_;
        return unless ($submode);
        addDatum($result, $k1, $enum, $submode);
        $submode = '';
    };

    my $in = FileHandle->new($filename, 'r');
    die "Error: Cannot open $filename" unless (defined $in);

    while (<$in>) {
        # Fold continued lines together
        if (/^(.*)\\$/) {
            $folded .= $1;
            next;
        }
        if ($folded) {
            $_ = $folded . $_;
            $folded = '';
        }

        # A closing brace ends whatever enum we are in.
        if ($mode && $mode ne 'DEPRECATED' && /^\s*\}/) {
            $mode = '';
            next;
        }

        if ($mode eq 'UProperty') {
            if (/^\s*(UCHAR_\w+)\s*[,=]/ || /^\s+(UCHAR_\w+)\s*$/) {
                $store_tagged->($key, $1);
            }
            elsif (m|^\s*/\*\*\s*(\w+)\s+property\s+(\w+)|i) {
                # "/** <Kind> property <Alias>" tags the next enumerator.
                die "Error: Unmatched tag $submode" if ($submode);
                die "Error: Unrecognized UProperty comment: $_"
                    unless (exists $PROP_TYPE{$1});
                ($key, $submode) = ($PROP_TYPE{$1}, $2);
            }
        }

        elsif ($mode eq 'UCharCategory') {
            if (/^\s*(U_\w+)\s*=/) {
                $store_tagged->('gc', $1);
            }
            elsif (m|^\s*/\*\*\s*([A-Z][a-z])\s|) {
                die "Error: Unmatched tag $submode" if ($submode);
                $submode = $1;
            }
        }

        elsif ($mode eq 'UCharDirection') {
            if (/^\s*(U_\w+)\s*[,=]/ || /^\s+(U_\w+)\s*$/) {
                $store_tagged->($key, $1);
            }
            elsif (m|/\*\*\s*([A-Z]+)\s|) {
                die "Error: Unmatched tag $submode" if ($submode);
                ($key, $submode) = ('bc', $1);
            }
        }

        elsif ($mode eq 'UJoiningGroup') {
            # The alias is the enum symbol minus its U_JG_ prefix; the
            # COUNT sentinel is not a value.
            if (/^\s*(U_JG_(\w+))/) {
                addDatum($result, 'jg', $1, $2) unless ($2 eq 'COUNT');
            }
        }

        elsif (exists $bracket_enum{$mode}) {
            my ($pattern, $k1) = @{ $bracket_enum{$mode} };
            if ($_ =~ $pattern) {
                addDatum($result, $k1, $1, $2);
            }
        }

        elsif ($mode eq 'DEPRECATED') {
            die "Error: Nested #ifdef" if (/\s*\#ifdef/);
            $mode = '' if (/\s*\#endif/);
        }

        elsif (!$mode) {
            if (/^\s*\#define\s+(\w+)\s+(.+)/) {
                # #define $left $right
                my ($left, $right) = ($1, $2);

                if ($left eq 'U_UNICODE_VERSION') {
                    # Strip the surrounding quotes when present.
                    my $version = ($right =~ /^\"(.*)\"/) ? $1 : $right;
                    die "Error: Multiple versions in $filename"
                        if (defined $result->{'_version'});
                    $result->{'_version'} = $version;
                }
                elsif ($left =~ /U_GC_(\w+?)_MASK/) {
                    addDatum($result, 'gcm', $left, $1);
                }
            }

            # "typedef enum Name" or plain "enum Name", with the brace
            # on this line or the next: the enum name becomes the mode.
            elsif (/^\s*typedef\s+enum\s+(\w+)\s*\{/ ||
                   /^\s*typedef\s+enum\s+(\w+)\s*$/ ||
                   /^\s*enum\s+(\w+)\s*\{/ ||
                   /^\s*enum\s+(\w+)\s*$/) {
                $mode = $1;
            }

            elsif (/^\s*\#ifdef\s+ICU_UCHAR_USE_DEPRECATES\b/) {
                $mode = 'DEPRECATED';
            }
        }
    }

    $in->close();

    # hardcode known values for the normalization quick check properties
    # see unorm.h for the UNormalizationCheckResult enum
    for my $prop ('NFC_QC', 'NFKC_QC') {
        addDatum($result, $prop, 'UNORM_NO', 'N');
        addDatum($result, $prop, 'UNORM_YES', 'Y');
        addDatum($result, $prop, 'UNORM_MAYBE', 'M');
    }

    # no "maybe" values for NF[K]D
    for my $prop ('NFD_QC', 'NFKD_QC') {
        addDatum($result, $prop, 'UNORM_NO', 'N');
        addDatum($result, $prop, 'UNORM_YES', 'Y');
    }

    return $result;
}
1315
#----------------------------------------------------------------------
# Store a value in a two-level hash: given a ref to a hash, two keys,
# and a value, set $hash->{$key1}->{$key2} = $value.  An existing
# entry is never overwritten; attempting to do so dies with a message
# naming both the old and the new value.
sub addDatum {
    my ($h, $k1, $k2, $v) = @_;

    die "Error: $k1:$k2 already set to " .
        $h->{$k1}->{$k2} . ", cannot set to " . $v
        if (exists $h->{$k1}{$k2});

    $h->{$k1}{$k2} = $v;
}
1327
1328 #eof