]> git.saurik.com Git - apple/icu.git/blob - icuSources/tools/genpname/preparse.pl
ICU-6.2.13.tar.gz
[apple/icu.git] / icuSources / tools / genpname / preparse.pl
1 #!/bin/perl -w
2 #*******************************************************************
3 # COPYRIGHT:
4 # Copyright (c) 2002-2004, International Business Machines Corporation and
5 # others. All Rights Reserved.
6 #*******************************************************************
7
8 # This script reads in UCD files PropertyAliases.txt and
9 # PropertyValueAliases.txt and correlates them with ICU enums
10 # defined in uchar.h and uscript.h. It then outputs a header
11 # file which contains all names and enums. The header is included
12 # by the genpname tool C++ source file, which produces the actual
13 # binary data file.
14 #
15 # See usage note below.
16 #
17 # TODO: The Property[Value]Alias.txt files state that they can support
18 # more than 2 names per property|value. Currently (Unicode 3.2) there
19 # are always 1 or 2 names. If more names were supported, presumably
20 # the format would be something like:
21 # nv ; Numeric_Value
22 # nv ; Value_Numerique
23 # CURRENTLY, this script assumes that there are 1 or two names. Any
24 # duplicates it sees are flagged as an error. If multiple aliases
25 # appear in a future version of Unicode, modify this script to support
26 # that.
27 #
28 # NOTE: As of ICU 2.6, this script has been modified to know about the
29 # pseudo-property gcm/General_Category_Mask, which corresponds to the
30 # uchar.h property UCHAR_GENERAL_CATEGORY_MASK. This property
31 # corresponds to General_Category but is a bitmask value. It does not
32 # exist in the UCD. Therefore, I special case it in several places
33 # (search for General_Category_Mask and gcm).
34 #
35 # NOTE: As of ICU 2.6, this script reads an auxiliary data file,
36 # SyntheticPropertyAliases.txt, containing property aliases not
37 # present in the UCD but present in ICU. This file resides in the
38 # same directory as this script. Its contents are merged into those
39 # of PropertyAliases.txt as if the two files were appended.
40 #
41 # NOTE: The following names are handled specially. See script below
42 # for details.
43 #
44 # T/True
45 # F/False
46 # No_Block
47 #
48 # Author: Alan Liu
49 # Created: October 14 2002
50 # Since: ICU 2.4
51
52 use FileHandle;
53 use strict;
54 use Dumpvalue;
55
56 my $DEBUG = 1;
57 my $DUMPER = new Dumpvalue;
58
59 my $count = @ARGV;
60 my $ICU_DIR = shift() || '';
61 my $OUT_FILE = shift() || 'data.h';
62 my $HEADER_DIR = "$ICU_DIR/source/common/unicode";
63 my $UNIDATA_DIR = "$ICU_DIR/source/data/unidata";
64
65 # Get the current year from the system
66 my $YEAR = 1900+@{[localtime]}[5]; # Get the current year
67
68 #----------------------------------------------------------------------
69 # Top level property keys for binary, enumerated, string, and double props
70 my @TOP = qw( _bp _ep _sp _dp _mp );
71
72 # This hash governs how top level properties are grouped into output arrays.
73 #my %TOP_PROPS = ( "VALUED" => [ '_bp', '_ep' ],
74 # "NO_VALUE" => [ '_sp', '_dp' ] );m
75 #my %TOP_PROPS = ( "BINARY" => [ '_bp' ],
76 # "ENUMERATED" => [ '_ep' ],
77 # "STRING" => [ '_sp' ],
78 # "DOUBLE" => [ '_dp' ] );
79 my %TOP_PROPS = ( "" => [ '_bp', '_ep', '_sp', '_dp', '_mp' ] );
80
81 my %PROP_TYPE = (Binary => "_bp",
82 String => "_sp",
83 Double => "_dp",
84 Enumerated => "_ep",
85 Bitmask => "_mp");
86 #----------------------------------------------------------------------
87
88 # Properties that are unsupported in ICU
89 my %UNSUPPORTED = (Composition_Exclusion => 1,
90 Decomposition_Mapping => 1,
91 Expands_On_NFC => 1,
92 Expands_On_NFD => 1,
93 Expands_On_NFKC => 1,
94 Expands_On_NFKD => 1,
95 FC_NFKC_Closure => 1,
96 ID_Start_Exceptions => 1,
97 Special_Case_Condition => 1,
98 );
99
100 # Short names of properties that weren't seen in uchar.h. If the
101 # properties weren't seen, don't complain about the property values
102 # missing.
103 my %MISSING_FROM_UCHAR;
104
105 #----------------------------------------------------------------------
106
107 # Emitted class names
108 my ($STRING_CLASS, $ALIAS_CLASS, $PROPERTY_CLASS) = qw(AliasName Alias Property);
109
110 if ($count < 1 || $count > 2 ||
111 !-d $HEADER_DIR ||
112 !-d $UNIDATA_DIR) {
113 my $me = $0;
114 $me =~ s|.+[/\\]||;
115 my $lm = ' ' x length($me);
116 print <<"END";
117
118 $me: Reads ICU4C headers and Unicode data files and creates
119 $lm a C header file that is included by genpname. The header
120 $lm file matches constants defined in the ICU4C headers with
121 $lm property|value aliases in the Unicode data files.
122
123 Usage: $me <icu_dir> [<out_file>]
124
125 <icu_dir> ICU4C root directory, containing
126 source/common/unicode/uchar.h
127 source/common/unicode/uscript.h
128 source/data/unidata/Blocks.txt
129 source/data/unidata/PropertyAliases.txt
130 source/data/unidata/PropertyValueAliases.txt
131 <out_file> File name of header to be written;
132 default is 'data.h'.
133
134 The Unicode versions of all input files must match.
135 END
136 exit(1);
137 }
138
139 my ($h, $version) = readAndMerge($HEADER_DIR, $UNIDATA_DIR);
140
141 if ($DEBUG) {
142 print "Merged hash:\n";
143 for my $key (sort keys %$h) {
144 my $hh = $h->{$key};
145 for my $subkey (sort keys %$hh) {
146 print "$key:$subkey:", $hh->{$subkey}, "\n";
147 }
148 }
149 }
150
151 my $out = new FileHandle($OUT_FILE, 'w');
152 die "Error: Can't write to $OUT_FILE: $!" unless (defined $out);
153 my $save = select($out);
154 formatData($h, $version);
155 select($save);
156 $out->close();
157
158 exit(0);
159
160 #----------------------------------------------------------------------
161 # From PropList.html: "The properties of the form Other_XXX
162 # are used to generate properties in DerivedCoreProperties.txt.
163 # They are not intended for general use, such as in APIs that
164 # return property values.
165 # Non_Break is not a valid property as of 3.2.
166 sub isIgnoredProperty {
167 local $_ = shift;
168 /^Other_/i || /^Non_Break$/i;
169 }
170
171 # 'qc' is a pseudo-property matching any quick-check property
172 # see PropertyValueAliases.txt file comments. 'binprop' is
173 # a synthetic binary value alias "True"/"False", not present
174 # in PropertyValueAliases.txt.
175 sub isPseudoProperty {
176 $_[0] eq 'qc' ||
177 $_[0] eq 'binprop';
178 }
179
180 #----------------------------------------------------------------------
181 # Emit the combined data from headers and the Unicode database as a
182 # C source code header file.
183 #
184 # @param ref to hash with the data
185 # @param Unicode version, as a string
186 sub formatData {
187 my $h = shift;
188 my $version = shift;
189
190 my $date = scalar localtime();
191 print <<"END";
192 /**
193 * Copyright (C) 2002-$YEAR, International Business Machines Corporation and
194 * others. All Rights Reserved.
195 *
196 * MACHINE GENERATED FILE. !!! Do not edit manually !!!
197 *
198 * Generated from
199 * uchar.h
200 * uscript.h
201 * Blocks.txt
202 * PropertyAliases.txt
203 * PropertyValueAliases.txt
204 *
205 * Date: $date
206 * Unicode version: $version
207 * Script: $0
208 */
209
210 END
211
212 #------------------------------------------------------------
213 # Emit Unicode version
214 print "/* Unicode version $version */\n";
215 my @v = split(/\./, $version);
216 push @v, '0' while (@v < 4);
217 for (my $i=0; $i<@v; ++$i) {
218 print "const uint8_t VERSION_$i = $v[$i];\n";
219 }
220 print "\n";
221
222 #------------------------------------------------------------
223 # Emit String table
224 # [A table of all identifiers, that is, all long or short property
225 # or value names. The list need NOT be sorted; it will be sorted
226 # by the C program. Strings are referenced by their index into
227 # this table. After sorting, a REMAP[] array is used to map the
228 # old position indices to the new positions.]
229 my %strings;
230 for my $prop (sort keys %$h) {
231 my $hh = $h->{$prop};
232 for my $enum (sort keys %$hh) {
233 my @a = split(/\|/, $hh->{$enum});
234 for (@a) {
235 $strings{$_} = 1 if (length($_));
236 }
237 }
238 }
239 my @strings = sort keys %strings;
240 unshift @strings, "";
241
242 print "const int32_t STRING_COUNT = ", scalar @strings, ";\n\n";
243
244 # while printing, create a mapping hash from string table entry to index
245 my %stringToID;
246 print "/* to be sorted */\n";
247 print "const $STRING_CLASS STRING_TABLE[] = {\n";
248 for (my $i=0; $i<@strings; ++$i) {
249 print " $STRING_CLASS(\"$strings[$i]\", $i),\n";
250 $stringToID{$strings[$i]} = $i;
251 }
252 print "};\n\n";
253
254 # placeholder for the remapping index. this is used to map
255 # indices that we compute here to indices of the sorted
256 # STRING_TABLE. STRING_TABLE will be sorted by the C++ program
257 # using the uprv_comparePropertyNames() function. this will
258 # reshuffle the order. we then use the indices (passed to the
259 # String constructor) to create a REMAP[] array.
260 print "/* to be filled in */\n";
261 print "int32_t REMAP[", scalar @strings, "];\n\n";
262
263 #------------------------------------------------------------
264 # Emit the name group table
265 # [A table of name groups. A name group is one or more names
266 # for a property or property value. The Unicode data files specify
267 # that there may be more than 2, although as of Unicode 3.2 there
268 # are at most 2. The name group table looks like this:
269 #
270 # 114, -115, 116, -117, 0, -118, 65, -64, ...
271 # [0] [2] [4] [6]
272 #
273 # The entry at [0] consists of 2 strings, 114 and 115.
274 # The entry at [2] consists of 116 and 117. The entry at
275 # [4] is one string, 118. There is always at least one
276 # string; typically there are two. If there are two, the first
277 # is the SHORT name and the second is the LONG. If there is
278 # one, then the missing entry (always the short name, in 3.2)
279 # is zero, which is by definition the index of "". The
280 # 'preferred' name will generally be the LONG name, if there are
281 # more than 2 entries. The last entry is negative.
282
283 # Build name group list and replace string refs with nameGroup indices
284 my @nameGroups;
285
286 # Check for duplicate name groups, and reuse them if possible
287 my %groupToInt; # Map group strings to ints
288 for my $prop (sort keys %$h) {
289 my $hh = $h->{$prop};
290 for my $enum (sort keys %$hh) {
291 my $groupString = $hh->{$enum};
292 my $i;
293 if (exists $groupToInt{$groupString}) {
294 $i = $groupToInt{$groupString};
295 } else {
296 my @names = split(/\|/, $groupString);
297 die "Error: Wrong number of names in " . $groupString if (@names < 2);
298 $i = @nameGroups; # index of group we are making
299 $groupToInt{$groupString} = $i; # Cache for reuse
300 push @nameGroups, map { $stringToID{$_} } @names;
301 $nameGroups[$#nameGroups] = -$nameGroups[$#nameGroups]; # mark end
302 }
303 # now, replace string list with ref to name group
304 $hh->{$enum} = $i;
305 }
306 }
307
308 print "const int32_t NAME_GROUP_COUNT = ",
309 scalar @nameGroups, ";\n\n";
310
311 print "int32_t NAME_GROUP[] = {\n";
312 # emit one group per line, with annotations
313 my $max_names = 0;
314 for (my $i=0; $i<@nameGroups; ) {
315 my @a;
316 my $line;
317 my $start = $i;
318 for (;;) {
319 my $j = $nameGroups[$i++];
320 $line .= "$j, ";
321 push @a, abs($j);
322 last if ($j < 0);
323 }
324 print " ",
325 $line,
326 ' 'x(20-length($line)),
327 "/* ", sprintf("%3d", $start),
328 ": \"", join("\", \"", map { $strings[$_] } @a), "\" */\n";
329 $max_names = @a if(@a > $max_names);
330
331 }
332 print "};\n\n";
333
334 # This is fixed for 3.2 at "2" but should be calculated dynamically
335 # when more than 2 names appear in Property[Value]Aliases.txt.
336 print "#define MAX_NAMES_PER_GROUP $max_names\n\n";
337
338 #------------------------------------------------------------
339 # Emit enumerated property values
340 for my $prop (sort keys %$h) {
341 next if ($prop =~ /^_/);
342 my $vh = $h->{$prop};
343 my $count = scalar keys %$vh;
344
345 print "const int32_t VALUES_${prop}_COUNT = ",
346 $count, ";\n\n";
347
348 print "const $ALIAS_CLASS VALUES_${prop}\[] = {\n";
349 for my $enum (sort keys %$vh) {
350 #my @names = split(/\|/, $vh->{$enum});
351 #die "Error: Wrong number of names for $prop:$enum in [" . join(",", @names) . "]"
352 # if (@names != 2);
353 print " $ALIAS_CLASS((int32_t) $enum, ", $vh->{$enum}, "),\n";
354 #$stringToID{$names[0]}, ", ",
355 #$stringToID{$names[1]}, "),\n";
356 # "\"", $names[0], "\", ",
357 # "\"", $names[1], "\"),\n";
358 }
359 print "};\n\n";
360 }
361
362 #------------------------------------------------------------
363 # Emit top-level properties (binary, enumerated, etc.)
364 for my $topName (sort keys %TOP_PROPS) {
365 my $a = $TOP_PROPS{$topName};
366 my $count = 0;
367 for my $type (@$a) { # "_bp", "_ep", etc.
368 $count += scalar keys %{$h->{$type}};
369 }
370
371 print "const int32_t ${topName}PROPERTY_COUNT = $count;\n\n";
372
373 print "const $PROPERTY_CLASS ${topName}PROPERTY[] = {\n";
374
375 for my $type (@$a) { # "_bp", "_ep", etc.
376 my $p = $h->{$type};
377
378 for my $enum (sort keys %$p) {
379 my $name = $strings[$nameGroups[$p->{$enum}]];
380
381 my $valueRef = "0, NULL";
382 if ($type eq '_bp') {
383 $valueRef = "VALUES_binprop_COUNT, VALUES_binprop";
384 }
385 elsif (exists $h->{$name}) {
386 $valueRef = "VALUES_${name}_COUNT, VALUES_$name";
387 }
388
389 print " $PROPERTY_CLASS((int32_t) $enum, ",
390 $p->{$enum}, ", $valueRef),\n";
391 }
392 }
393 print "};\n\n";
394 }
395
396 print "/*eof*/\n";
397 }
398
399 #----------------------------------------------------------------------
400 # Read in the files uchar.h, uscript.h, Blocks.txt,
401 # PropertyAliases.txt, and PropertyValueAliases.txt,
402 # and combine them into one hash.
403 #
404 # @param directory containing headers
405 # @param directory containin Unicode data files
406 #
407 # @return hash ref, Unicode version
408 sub readAndMerge {
409
410 my ($headerDir, $unidataDir) = @_;
411
412 my $h = read_uchar("$headerDir/uchar.h");
413 my $s = read_uscript("$headerDir/uscript.h");
414 my $b = read_Blocks("$unidataDir/Blocks.txt");
415 my $pa = {};
416 read_PropertyAliases($pa, "$unidataDir/PropertyAliases.txt");
417 read_PropertyAliases($pa, "SyntheticPropertyAliases.txt");
418 my $va = read_PropertyValueAliases("$unidataDir/PropertyValueAliases.txt");
419
420 # Extract property family hash
421 my $fam = $pa->{'_family'};
422 delete $pa->{'_family'};
423
424 # Note: uscript.h has no version string, so don't check it
425 my $version = check_versions([ 'uchar.h', $h ],
426 [ 'Blocks.txt', $b ],
427 [ 'PropertyAliases.txt', $pa ],
428 [ 'PropertyValueAliases.txt', $va ]);
429
430 # Do this BEFORE merging; merging modifies the hashes
431 check_PropertyValueAliases($pa, $va);
432
433 # Dump out the $va hash for debugging
434 if ($DEBUG) {
435 print "Property values hash:\n";
436 for my $key (sort keys %$va) {
437 my $hh = $va->{$key};
438 for my $subkey (sort keys %$hh) {
439 print "$key:$subkey:", $hh->{$subkey}, "\n";
440 }
441 }
442 }
443
444 # Dump out the $s hash for debugging
445 if ($DEBUG) {
446 print "Script hash:\n";
447 for my $key (sort keys %$s) {
448 print "$key:", $s->{$key}, "\n";
449 }
450 }
451
452 # Link in the script data
453 $h->{'sc'} = $s;
454
455 merge_Blocks($h, $b);
456
457 merge_PropertyAliases($h, $pa, $fam);
458
459 merge_PropertyValueAliases($h, $va);
460
461 ($h, $version);
462 }
463
464 #----------------------------------------------------------------------
465 # Ensure that the version strings in the given hashes (under the key
466 # '_version') are compatible. Currently this means they must be
467 # identical, with the exception that "X.Y" will match "X.Y.0".
468 # All hashes must define the key '_version'.
469 #
470 # @param a list of pairs of (file name, hash reference)
471 #
472 # @return the version of all the hashes. Upon return, the '_version'
473 # will be removed from all hashes.
474 sub check_versions {
475 my $version = '';
476 my $msg = '';
477 foreach my $a (@_) {
478 my $name = $a->[0];
479 my $h = $a->[1];
480 die "Error: No version found" unless (exists $h->{'_version'});
481 my $v = $h->{'_version'};
482 delete $h->{'_version'};
483
484 # append ".0" if necessary, to standardize to X.Y.Z
485 $v .= '.0' unless ($v =~ /\.\d+\./);
486 $v .= '.0' unless ($v =~ /\.\d+\./);
487 $msg .= "$name = $v\n";
488 if ($version) {
489 die "Error: Mismatched Unicode versions\n$msg"
490 unless ($version eq $v);
491 } else {
492 $version = $v;
493 }
494 }
495 $version;
496 }
497
498 #----------------------------------------------------------------------
499 # Make sure the property names in PropertyValueAliases.txt match those
500 # in PropertyAliases.txt.
501 #
502 # @param a hash ref from read_PropertyAliases.
503 # @param a hash ref from read_PropertyValueAliases.
504 sub check_PropertyValueAliases {
505 my ($pa, $va) = @_;
506
507 # make a reverse hash of short->long
508 my %rev;
509 for (keys %$pa) { $rev{$pa->{$_}} = $_; }
510
511 for my $prop (keys %$va) {
512 if (!exists $rev{$prop} && !isPseudoProperty($prop)) {
513 print "Warning: Property $prop from PropertyValueAliases not listed in PropertyAliases\n";
514 }
515 }
516 }
517
518 #----------------------------------------------------------------------
519 # Merge blocks data into uchar.h enum data. In the 'blk' subhash all
520 # code point values, as returned from read_uchar, are replaced by
521 # block names, as read from Blocks.txt and returned by read_Blocks.
522 # The match must be 1-to-1. If there is any failure of 1-to-1
523 # mapping, an error is signaled. Upon return, the read_Blocks hash
524 # is emptied of all contents, except for those that failed to match.
525 #
526 # The mapping in the 'blk' subhash, after this function returns, is
527 # from uchar.h enum name, e.g. "UBLOCK_BASIC_LATIN", to Blocks.h
528 # pseudo-name, e.g. "Basic Latin".
529 #
530 # @param a hash ref from read_uchar.
531 # @param a hash ref from read_Blocks.
532 sub merge_Blocks {
533 my ($h, $b) = @_;
534
535 die "Error: No blocks data in uchar.h"
536 unless (exists $h->{'blk'});
537 my $blk = $h->{'blk'};
538 for my $enum (keys %$blk) {
539 my $cp = $blk->{$enum};
540 if ($cp && !exists $b->{$cp}) {
541 die "Error: No block found at $cp in Blocks.txt";
542 }
543 # Convert code point to pseudo-name:
544 $blk->{$enum} = $b->{$cp};
545 delete $b->{$cp};
546 }
547 my $err = '';
548 for my $cp (keys %$b) {
549 $err .= "Error: Block " . $b->{$cp} . " not listed in uchar.h\n";
550 }
551 die $err if ($err);
552 }
553
554 #----------------------------------------------------------------------
555 # Merge property alias names into the uchar.h hash. The subhashes
556 # under the keys _* (b(inary, e(numerated, s(tring, d(ouble) are
557 # examined and the values of those subhashes are assumed to be long
558 # names in PropertyAliases.txt. They are validated and replaced by
559 # "<short>|<long>". Upon return, the read_PropertyAliases hash is
560 # emptied of all contents, except for those that failed to match.
561 # Unmatched names in PropertyAliases are listed as a warning but do
562 # NOT cause the script to die.
563 #
564 # @param a hash ref from read_uchar.
565 # @param a hash ref from read_PropertyAliases.
566 # @param a hash mapping long names to property family (e.g., 'binary')
567 sub merge_PropertyAliases {
568 my ($h, $pa, $fam) = @_;
569
570 for my $k (@TOP) {
571 die "Error: No properties data for $k in uchar.h"
572 unless (exists $h->{$k});
573 }
574
575 for my $subh (map { $h->{$_} } @TOP) {
576 for my $enum (keys %$subh) {
577 my $name = $subh->{$enum};
578 die "Error: Property $name not found (or used more than once)"
579 unless (exists $pa->{$name});
580
581 $subh->{$enum} = $pa->{$name} . "|" . $name;
582 delete $pa->{$name};
583 }
584 }
585 my @err;
586 for my $name (keys %$pa) {
587 $MISSING_FROM_UCHAR{$pa->{$name}} = 1;
588 if (exists $UNSUPPORTED{$name}) {
589 push @err, "Info: No enum for " . $fam->{$name} . " property $name in uchar.h";
590 } elsif (!isIgnoredProperty($name)) {
591 push @err, "Warning: No enum for " . $fam->{$name} . " property $name in uchar.h";
592 }
593 }
594 print join("\n", sort @err), "\n" if (@err);
595 }
596
597 #----------------------------------------------------------------------
598 # Return 1 if two names match ignoring whitespace, '-', and '_'.
599 # Used to match names in Blocks.txt with those in PropertyValueAliases.txt
600 # as of Unicode 4.0.
601 sub matchesLoosely {
602 my ($a, $b) = @_;
603 $a =~ s/[\s\-_]//g;
604 $b =~ s/[\s\-_]//g;
605 $a =~ /^$b$/i;
606 }
607
608 #----------------------------------------------------------------------
609 # Merge PropertyValueAliases.txt data into the uchar.h hash. All
610 # properties other than blk, _bp, and _ep are analyzed and mapped to
611 # the names listed in PropertyValueAliases. They are then replaced
612 # with a string of the form "<short>|<long>". The short or long name
613 # may be missing.
614 #
615 # @param a hash ref from read_uchar.
616 # @param a hash ref from read_PropertyValueAliases.
617 sub merge_PropertyValueAliases {
618 my ($h, $va) = @_;
619
620 my %gcCount;
621 for my $prop (keys %$h) {
622 # _bp, _ep handled in merge_PropertyAliases
623 next if ($prop =~ /^_/);
624
625 # Special case: gcm
626 my $prop2 = ($prop eq 'gcm') ? 'gc' : $prop;
627
628 # find corresponding PropertyValueAliases data
629 die "Error: Can't find $prop in PropertyValueAliases.txt"
630 unless (exists $va->{$prop2});
631 my $pva = $va->{$prop2};
632
633 # match up data
634 my $hh = $h->{$prop};
635 for my $enum (keys %$hh) {
636
637 my $name = $hh->{$enum};
638
639 # look up both long and short & ignore case
640 my $n;
641 if (exists $pva->{$name}) {
642 $n = $name;
643 } else {
644 # iterate (slow)
645 for my $a (keys %$pva) {
646 # case-insensitive match
647 # & case-insensitive reverse match
648 if ($a =~ /^$name$/i ||
649 $pva->{$a} =~ /^$name$/i) {
650 $n = $a;
651 last;
652 }
653 }
654 }
655
656 # For blocks, do a loose match from Blocks.txt pseudo-name
657 # to PropertyValueAliases long name.
658 if (!$n && $prop eq 'blk') {
659 for my $a (keys %$pva) {
660 # The block is only going to match the long name,
661 # but we check both for completeness. As of Unicode
662 # 4.0, blocks do not have short names.
663 if (matchesLoosely($name, $pva->{$a}) ||
664 matchesLoosely($name, $a)) {
665 $n = $a;
666 last;
667 }
668 }
669 }
670
671 die "Error: Property value $prop:$name not found" unless ($n);
672
673 my $l = $n;
674 my $r = $pva->{$n};
675 # convert |n/a\d+| to blank
676 $l = '' if ($l =~ m|^n/a\d+$|);
677 $r = '' if ($r =~ m|^n/a\d+$|);
678
679 $hh->{$enum} = "$l|$r";
680 # Don't delete the 'gc' properties because we need to share
681 # them between 'gc' and 'gcm'. Count each use instead.
682 if ($prop2 eq 'gc') {
683 ++$gcCount{$n};
684 } else {
685 delete $pva->{$n};
686 }
687 }
688 }
689
690 # Merge the combining class values in manually
691 # Add the same values to the synthetic lccc and tccc properties
692 die "Error: No ccc data"
693 unless exists $va->{'ccc'};
694 for my $ccc (keys %{$va->{'ccc'}}) {
695 die "Error: Can't overwrite ccc $ccc"
696 if (exists $h->{'ccc'}->{$ccc});
697 $h->{'lccc'}->{$ccc} =
698 $h->{'tccc'}->{$ccc} =
699 $h->{'ccc'}->{$ccc} = $va->{'ccc'}->{$ccc};
700 }
701 delete $va->{'ccc'};
702
703 # Merge synthetic binary property values in manually.
704 # These are the "True" and "False" value aliases.
705 die "Error: No True/False value aliases"
706 unless exists $va->{'binprop'};
707 for my $bp (keys %{$va->{'binprop'}}) {
708 $h->{'binprop'}->{$bp} = $va->{'binprop'}->{$bp};
709 }
710 delete $va->{'binprop'};
711
712 my $err = '';
713 for my $prop (sort keys %$va) {
714 my $hh = $va->{$prop};
715 for my $subkey (sort keys %$hh) {
716 # 'gc' props are shared with 'gcm'; make sure they were used
717 # once or twice.
718 if ($prop eq 'gc') {
719 my $n = $gcCount{$subkey};
720 next if ($n >= 1 && $n <= 2);
721 }
722 $err .= "Warning: Enum for value $prop:$subkey not found in uchar.h\n"
723 unless exists $MISSING_FROM_UCHAR{$prop};
724 }
725 }
726 print $err if ($err);
727 }
728
729 #----------------------------------------------------------------------
730 # Read the PropertyAliases.txt file. Return a hash that maps the long
731 # name to the short name. The special key '_version' will map to the
732 # Unicode version of the file. The special key '_family' holds a
733 # subhash that maps long names to a family string, for descriptive
734 # purposes.
735 #
736 # @param a filename for PropertyAliases.txt
737 # @param reference to hash to receive data. Keys are long names.
738 # Values are short names.
739 sub read_PropertyAliases {
740
741 my $hash = shift; # result
742
743 my $filename = shift;
744
745 my $fam = {}; # map long names to family string
746 $fam = $hash->{'_family'} if (exists $hash->{'_family'});
747
748 my $family; # binary, enumerated, etc.
749
750 my $in = new FileHandle($filename, 'r');
751 die "Error: Cannot open $filename" if (!defined $in);
752
753 while (<$in>) {
754
755 # Read version (embedded in a comment)
756 if (/PropertyAliases-(\d+\.\d+\.\d+)/i) {
757 die "Error: Multiple versions in $filename"
758 if (exists $hash->{'_version'});
759 $hash->{'_version'} = $1;
760 }
761
762 # Read family heading
763 if (/^\s*\#\s*(.+?)\s*Properties\s*$/) {
764 $family = $1;
765 }
766
767 # Ignore comments and blank lines
768 s/\#.*//;
769 next unless (/\S/);
770
771 if (/^\s*(.+?)\s*;\s*(.+?)\s*$/i) {
772 die "Error: Duplicate property $1 in $filename"
773 if (exists $hash->{$2});
774 $hash->{$2} = $1;
775 $fam->{$2} = $family;
776 }
777
778 else {
779 die "Error: Can't parse $_ in $filename";
780 }
781 }
782
783 $in->close();
784
785 $hash->{'_family'} = $fam;
786 }
787
788 #----------------------------------------------------------------------
789 # Read the PropertyValueAliases.txt file. Return a two level hash
790 # that maps property_short_name:value_short_name:value_long_name. In
791 # the case of the 'ccc' property, the short name is the numeric class
792 # and the long name is "<short>|<long>". The special key '_version'
793 # will map to the Unicode version of the file.
794 #
795 # @param a filename for PropertyValueAliases.txt
796 #
797 # @return a hash reference.
798 sub read_PropertyValueAliases {
799
800 my $filename = shift;
801
802 my $hash = {}; # result
803
804 my $in = new FileHandle($filename, 'r');
805 die "Error: Cannot open $filename" if (!defined $in);
806
807 my $sym = 0; # Used to make "n/a" strings unique
808
809 while (<$in>) {
810
811 # Read version (embedded in a comment)
812 if (/PropertyValueAliases-(\d+\.\d+\.\d+)/i) {
813 die "Error: Multiple versions in $filename"
814 if (exists $hash->{'_version'});
815 $hash->{'_version'} = $1;
816 }
817
818 # Ignore comments and blank lines
819 s/\#.*//;
820 next unless (/\S/);
821
822 if (/^\s*(.+?)\s*;/i) {
823 my $prop = $1;
824 my @fields = /;\s*([^\s;]+)/g;
825 die "Error: Wrong number of fields"
826 if (@fields < 2 || @fields > 3);
827 # Make "n/a" strings unique
828 $fields[0] .= sprintf("%03d", $sym++) if ($fields[0] eq 'n/a');
829 # Squash extra fields together
830 while (@fields > 2) {
831 my $f = pop @fields;
832 $fields[$#fields] .= '|' . $f;
833 }
834 addDatum($hash, $prop, @fields);
835 }
836
837 else {
838 die "Error: Can't parse $_ in $filename";
839 }
840 }
841
842 $in->close();
843
844 # Script Qaac (Coptic) is a special case. Handle it here. See UTR#24:
845 # http://www.unicode.org/unicode/reports/tr24/
846 $hash->{'sc'}->{'Qaac'} = 'Coptic'
847 unless (exists $hash->{'sc'}->{'Qaac'});
848
849 # Add T|True and F|False -- these are values we recognize for
850 # binary properties (NOT from PropertyValueAliases.txt). These
851 # are of the same form as the 'ccc' value aliases.
852 $hash->{'binprop'}->{'0'} = 'F|False';
853 $hash->{'binprop'}->{'1'} = 'T|True';
854
855 $hash;
856 }
857
858 #----------------------------------------------------------------------
859 # Read the Blocks.txt file. Return a hash that maps the code point
860 # range start to the block name. The special key '_version' will map
861 # to the Unicode version of the file.
862 #
863 # As of Unicode 4.0, the names in the Blocks.txt are no longer the
864 # proper names. The proper names are now listed in PropertyValueAliases.
865 # They are similar but not identical. Furthermore, 4.0 introduces
866 # a new block name, No_Block, which is listed only in PropertyValueAliases
867 # and not in Blocks.txt. As a result, we handle blocks as follows:
868 #
869 # 1. Read Blocks.txt to map code point range start to quasi-block name.
870 # 2. Add to Blocks.txt a synthetic No Block code point & name:
871 # X -> No Block
872 # 3. Map quasi-names from Blocks.txt (including No Block) to actual
873 # names from PropertyValueAliases. This occurs in
874 # merge_PropertyValueAliases.
875 #
876 # @param a filename for Blocks.txt
877 #
878 # @return a ref to a hash. Keys are code points, as text, e.g.,
879 # "1720". Values are pseudo-block names, e.g., "Hanunoo".
880 sub read_Blocks {
881
882 my $filename = shift;
883
884 my $hash = {}; # result
885
886 my $in = new FileHandle($filename, 'r');
887 die "Error: Cannot open $filename" if (!defined $in);
888
889 while (<$in>) {
890
891 # Read version (embedded in a comment)
892 if (/Blocks-(\d+\.\d+\.\d+)/i) {
893 die "Error: Multiple versions in $filename"
894 if (exists $hash->{'_version'});
895 $hash->{'_version'} = $1;
896 }
897
898 # Ignore comments and blank lines
899 s/\#.*//;
900 next unless (/\S/);
901
902 if (/^([0-9a-f]+)\.\.[0-9a-f]+;\s*(.+?)\s*$/i) {
903 die "Error: Duplicate range $1 in $filename"
904 if (exists $hash->{$1});
905 $hash->{$1} = $2;
906 }
907
908 else {
909 die "Error: Can't parse $_ in $filename";
910 }
911 }
912
913 $in->close();
914
915 # Add pseudo-name for No Block
916 $hash->{'none'} = 'No Block';
917
918 $hash;
919 }
920
921 #----------------------------------------------------------------------
922 # Read the uscript.h file and compile a mapping of Unicode symbols to
923 # icu4c enum values.
924 #
925 # @param a filename for uscript.h
926 #
927 # @return a ref to a hash. The keys of the hash are enum symbols from
928 # uscript.h, and the values are script names.
929 sub read_uscript {
930
931 my $filename = shift;
932
933 my $mode = ''; # state machine mode and submode
934 my $submode = '';
935
936 my $last = ''; # for line folding
937
938 my $hash = {}; # result
939 my $key; # first-level key
940
941 my $in = new FileHandle($filename, 'r');
942 die "Error: Cannot open $filename" if (!defined $in);
943
944 while (<$in>) {
945 # Fold continued lines together
946 if (/^(.*)\\$/) {
947 $last = $1;
948 next;
949 } elsif ($last) {
950 $_ = $last . $_;
951 $last = '';
952 }
953
954 # Exit all modes here
955 if ($mode && $mode ne 'DEPRECATED') {
956 if (/^\s*\}/) {
957 $mode = '';
958 next;
959 }
960 }
961
962 # Handle individual modes
963
964 if ($mode eq 'UScriptCode') {
965 if (m|^\s*(USCRIPT_\w+).+?/\*\s*(\w+)|) {
966 my ($enum, $code) = ($1, $2);
967 die "Error: Duplicate script $enum"
968 if (exists $hash->{$enum});
969 $hash->{$enum} = $code;
970 }
971 }
972
973 elsif ($mode eq 'DEPRECATED') {
974 if (/\s*\#ifdef/) {
975 die "Error: Nested #ifdef";
976 }
977 elsif (/\s*\#endif/) {
978 $mode = '';
979 }
980 }
981
982 elsif (!$mode) {
983 if (/^\s*typedef\s+enum\s+(\w+)\s*\{/ ||
984 /^\s*typedef\s+enum\s+(\w+)\s*$/) {
985 $mode = $1;
986 #print "Parsing $mode\n";
987 }
988
989 elsif (/^\s*\#ifdef\s+ICU_UCHAR_USE_DEPRECATES\b/) {
990 $mode = 'DEPRECATED';
991 }
992 }
993 }
994
995 $in->close();
996
997 $hash;
998 }
999
1000 #----------------------------------------------------------------------
1001 # Read the uchar.h file and compile a mapping of Unicode symbols to
1002 # icu4c enum values.
1003 #
1004 # @param a filename for uchar.h
1005 #
1006 # @return a ref to a hash. The keys of the hash are '_bp' for binary
1007 # properties, '_ep' for enumerated properties, '_dp'/'_sp'/'_mp' for
1008 # double/string/mask properties, and 'gc', 'gcm', 'bc', 'blk',
1009 # 'ea', 'dt', 'jt', 'jg', 'lb', or 'nt' for corresponding property
1010 # value aliases. The values of the hash are subhashes. The subhashes
1011 # have a key of the uchar.h enum symbol, and a value of the alias
1012 # string (as listed in PropertyValueAliases.txt). NOTE: The alias
1013 # string is whatever alias uchar.h lists. This may be either short or
1014 # long, depending on the specific enum. NOTE: For blocks ('blk'), the
1015 # value is a hex code point for the start of the associated block.
1016 # NOTE: The special key _version will map to the Unicode version of
1017 # the file.
1018 sub read_uchar {
1019
1020 my $filename = shift;
1021
1022 my $mode = ''; # state machine mode and submode
1023 my $submode = '';
1024
1025 my $last = ''; # for line folding
1026
1027 my $hash = {}; # result
1028 my $key; # first-level key
1029
1030 my $in = new FileHandle($filename, 'r');
1031 die "Error: Cannot open $filename" if (!defined $in);
1032
1033 while (<$in>) {
1034 # Fold continued lines together
1035 if (/^(.*)\\$/) {
1036 $last .= $1;
1037 next;
1038 } elsif ($last) {
1039 $_ = $last . $_;
1040 $last = '';
1041 }
1042
1043 # Exit all modes here
1044 if ($mode && $mode ne 'DEPRECATED') {
1045 if (/^\s*\}/) {
1046 $mode = '';
1047 next;
1048 }
1049 }
1050
1051 # Handle individual modes
1052
1053 if ($mode eq 'UProperty') {
1054 if (/^\s*(UCHAR_\w+)\s*[,=]/ || /^\s+(UCHAR_\w+)\s*$/) {
1055 if ($submode) {
1056 addDatum($hash, $key, $1, $submode);
1057 $submode = '';
1058 } else {
1059 #print "Warning: Ignoring $1\n";
1060 }
1061 }
1062
1063 elsif (m|^\s*/\*\*\s*(\w+)\s+property\s+(\w+)|i) {
1064 die "Error: Unmatched tag $submode" if ($submode);
1065 die "Error: Unrecognized UProperty comment: $_"
1066 unless (exists $PROP_TYPE{$1});
1067 $key = $PROP_TYPE{$1};
1068 $submode = $2;
1069 }
1070 }
1071
1072 elsif ($mode eq 'UCharCategory') {
1073 if (/^\s*(U_\w+)\s*=/) {
1074 if ($submode) {
1075 addDatum($hash, 'gc', $1, $submode);
1076 $submode = '';
1077 } else {
1078 #print "Warning: Ignoring $1\n";
1079 }
1080 }
1081
1082 elsif (m|^\s*/\*\*\s*([A-Z][a-z])\s|) {
1083 die "Error: Unmatched tag $submode" if ($submode);
1084 $submode = $1;
1085 }
1086 }
1087
1088 elsif ($mode eq 'UCharDirection') {
1089 if (/^\s*(U_\w+)\s*[,=]/ || /^\s+(U_\w+)\s*$/) {
1090 if ($submode) {
1091 addDatum($hash, $key, $1, $submode);
1092 $submode = '';
1093 } else {
1094 #print "Warning: Ignoring $1\n";
1095 }
1096 }
1097
1098 elsif (m|/\*\*\s*([A-Z]+)\s|) {
1099 die "Error: Unmatched tag $submode" if ($submode);
1100 $key = 'bc';
1101 $submode = $1;
1102 }
1103 }
1104
1105 elsif ($mode eq 'UBlockCode') {
1106 if (m|^\s*(UBLOCK_\w+).+?/\*\[(.+?)\]\*/|) {
1107 addDatum($hash, 'blk', $1, $2);
1108 }
1109 }
1110
1111 elsif ($mode eq 'UEastAsianWidth') {
1112 if (m|^\s*(U_EA_\w+).+?/\*\[(.+?)\]\*/|) {
1113 addDatum($hash, 'ea', $1, $2);
1114 }
1115 }
1116
1117 elsif ($mode eq 'UDecompositionType') {
1118 if (m|^\s*(U_DT_\w+).+?/\*\[(.+?)\]\*/|) {
1119 addDatum($hash, 'dt', $1, $2);
1120 }
1121 }
1122
1123 elsif ($mode eq 'UJoiningType') {
1124 if (m|^\s*(U_JT_\w+).+?/\*\[(.+?)\]\*/|) {
1125 addDatum($hash, 'jt', $1, $2);
1126 }
1127 }
1128
1129 elsif ($mode eq 'UJoiningGroup') {
1130 if (/^\s*(U_JG_(\w+))/) {
1131 addDatum($hash, 'jg', $1, $2) unless ($2 eq 'COUNT');
1132 }
1133 }
1134
1135 elsif ($mode eq 'ULineBreak') {
1136 if (m|^\s*(U_LB_\w+).+?/\*\[(.+?)\]\*/|) {
1137 addDatum($hash, 'lb', $1, $2);
1138 }
1139 }
1140
1141 elsif ($mode eq 'UNumericType') {
1142 if (m|^\s*(U_NT_\w+).+?/\*\[(.+?)\]\*/|) {
1143 addDatum($hash, 'nt', $1, $2);
1144 }
1145 }
1146
1147 elsif ($mode eq 'UHangulSyllableType') {
1148 if (m|^\s*(U_HST_\w+).+?/\*\[(.+?)\]\*/|) {
1149 addDatum($hash, 'hst', $1, $2);
1150 }
1151 }
1152
1153 elsif ($mode eq 'DEPRECATED') {
1154 if (/\s*\#ifdef/) {
1155 die "Error: Nested #ifdef";
1156 }
1157 elsif (/\s*\#endif/) {
1158 $mode = '';
1159 }
1160 }
1161
1162 elsif (!$mode) {
1163 if (/^\s*\#define\s+(\w+)\s+(.+)/) {
1164 # #define $left $right
1165 my ($left, $right) = ($1, $2);
1166
1167 if ($left eq 'U_UNICODE_VERSION') {
1168 my $version = $right;
1169 $version = $1 if ($version =~ /^\"(.*)\"/);
1170 # print "Unicode version: ", $version, "\n";
1171 die "Error: Multiple versions in $filename"
1172 if (defined $hash->{'_version'});
1173 $hash->{'_version'} = $version;
1174 }
1175
1176 elsif ($left =~ /U_GC_(\w+?)_MASK/) {
1177 addDatum($hash, 'gcm', $left, $1);
1178 }
1179 }
1180
1181 elsif (/^\s*typedef\s+enum\s+(\w+)\s*\{/ ||
1182 /^\s*typedef\s+enum\s+(\w+)\s*$/) {
1183 $mode = $1;
1184 #print "Parsing $mode\n";
1185 }
1186
1187 elsif (/^\s*enum\s+(\w+)\s*\{/ ||
1188 /^\s*enum\s+(\w+)\s*$/) {
1189 $mode = $1;
1190 #print "Parsing $mode\n";
1191 }
1192
1193 elsif (/^\s*\#ifdef\s+ICU_UCHAR_USE_DEPRECATES\b/) {
1194 $mode = 'DEPRECATED';
1195 }
1196 }
1197 }
1198
1199 $in->close();
1200
1201 # hardcode known values for the normalization quick check properties
1202 # see unorm.h for the UNormalizationCheckResult enum
1203
1204 addDatum($hash, 'NFC_QC', 'UNORM_NO', 'N');
1205 addDatum($hash, 'NFC_QC', 'UNORM_YES', 'Y');
1206 addDatum($hash, 'NFC_QC', 'UNORM_MAYBE', 'M');
1207
1208 addDatum($hash, 'NFKC_QC', 'UNORM_NO', 'N');
1209 addDatum($hash, 'NFKC_QC', 'UNORM_YES', 'Y');
1210 addDatum($hash, 'NFKC_QC', 'UNORM_MAYBE', 'M');
1211
1212 # no "maybe" values for NF[K]D
1213
1214 addDatum($hash, 'NFD_QC', 'UNORM_NO', 'N');
1215 addDatum($hash, 'NFD_QC', 'UNORM_YES', 'Y');
1216
1217 addDatum($hash, 'NFKD_QC', 'UNORM_NO', 'N');
1218 addDatum($hash, 'NFKD_QC', 'UNORM_YES', 'Y');
1219
1220 $hash;
1221 }
1222
1223 #----------------------------------------------------------------------
1224 # Add a new value to a two-level hash. That is, given a ref to
1225 # a hash, two keys, and a value, add $hash->{$key1}->{$key2} = $value.
1226 sub addDatum {
1227 my ($h, $k1, $k2, $v) = @_;
1228 if (exists $h->{$k1}->{$k2}) {
1229 die "Error: $k1:$k2 already set to " .
1230 $h->{$k1}->{$k2} . ", cannot set to " . $v;
1231 }
1232 $h->{$k1}->{$k2} = $v;
1233 }
1234
1235 #eof