]> git.saurik.com Git - apple/icu.git/blame_incremental - icuSources/tools/genpname/preparse.pl
ICU-8.11.1.tar.gz
[apple/icu.git] / icuSources / tools / genpname / preparse.pl
... / ...
CommitLineData
1#!/bin/perl -w
2#*******************************************************************
3# COPYRIGHT:
4# Copyright (c) 2002-2006, International Business Machines Corporation and
5# others. All Rights Reserved.
6#*******************************************************************
7
8# This script reads in UCD files PropertyAliases.txt and
9# PropertyValueAliases.txt and correlates them with ICU enums
10# defined in uchar.h and uscript.h. It then outputs a header
11# file which contains all names and enums. The header is included
12# by the genpname tool C++ source file, which produces the actual
13# binary data file.
14#
15# See usage note below.
16#
17# TODO: The Property[Value]Alias.txt files state that they can support
18# more than 2 names per property|value. Currently (Unicode 3.2) there
19# are always 1 or 2 names. If more names were supported, presumably
20# the format would be something like:
21# nv ; Numeric_Value
22# nv ; Value_Numerique
23# CURRENTLY, this script assumes that there are 1 or two names. Any
24# duplicates it sees are flagged as an error. If multiple aliases
25# appear in a future version of Unicode, modify this script to support
26# that.
27#
28# NOTE: As of ICU 2.6, this script has been modified to know about the
29# pseudo-property gcm/General_Category_Mask, which corresponds to the
30# uchar.h property UCHAR_GENERAL_CATEGORY_MASK. This property
31# corresponds to General_Category but is a bitmask value. It does not
32# exist in the UCD. Therefore, I special case it in several places
33# (search for General_Category_Mask and gcm).
34#
35# NOTE: As of ICU 2.6, this script reads an auxiliary data file,
36# SyntheticPropertyAliases.txt, containing property aliases not
37# present in the UCD but present in ICU. This file resides in the
38# same directory as this script. Its contents are merged into those
39# of PropertyAliases.txt as if the two files were appended.
40#
41# NOTE: The following names are handled specially. See script below
42# for details.
43#
44# T/True
45# F/False
46# No_Block
47#
48# Author: Alan Liu
49# Created: October 14 2002
50# Since: ICU 2.4
51
52use FileHandle;
53use strict;
54use Dumpvalue;
55
56my $DEBUG = 1;
57my $DUMPER = new Dumpvalue;
58
59my $count = @ARGV;
60my $ICU_DIR = shift() || '';
61my $OUT_FILE = shift() || 'data.h';
62my $HEADER_DIR = "$ICU_DIR/source/common/unicode";
63my $UNIDATA_DIR = "$ICU_DIR/source/data/unidata";
64
65# Get the current year from the system
66my $YEAR = 1900+@{[localtime]}[5]; # Get the current year
67
68# Used to make "n/a" property [value] aliases (Unicode or Synthetic) unique
69my $propNA = 0;
70my $valueNA = 0;
71
72#----------------------------------------------------------------------
73# Top level property keys for binary, enumerated, string, and double props
74my @TOP = qw( _bp _ep _sp _dp _mp );
75
76# This hash governs how top level properties are grouped into output arrays.
77#my %TOP_PROPS = ( "VALUED" => [ '_bp', '_ep' ],
78# "NO_VALUE" => [ '_sp', '_dp' ] );m
79#my %TOP_PROPS = ( "BINARY" => [ '_bp' ],
80# "ENUMERATED" => [ '_ep' ],
81# "STRING" => [ '_sp' ],
82# "DOUBLE" => [ '_dp' ] );
83my %TOP_PROPS = ( "" => [ '_bp', '_ep', '_sp', '_dp', '_mp' ] );
84
85my %PROP_TYPE = (Binary => "_bp",
86 String => "_sp",
87 Double => "_dp",
88 Enumerated => "_ep",
89 Bitmask => "_mp");
90#----------------------------------------------------------------------
91
92# Properties that are unsupported in ICU
93my %UNSUPPORTED = (Composition_Exclusion => 1,
94 Decomposition_Mapping => 1,
95 Expands_On_NFC => 1,
96 Expands_On_NFD => 1,
97 Expands_On_NFKC => 1,
98 Expands_On_NFKD => 1,
99 FC_NFKC_Closure => 1,
100 ID_Start_Exceptions => 1,
101 Special_Case_Condition => 1,
102 );
103
104# Short names of properties that weren't seen in uchar.h. If the
105# properties weren't seen, don't complain about the property values
106# missing.
107my %MISSING_FROM_UCHAR;
108
109# Additional property aliases beyond short and long names,
110# like space in addition to WSpace and White_Space in Unicode 4.1.
111# Hashtable, maps long name to alias.
112# For example, maps White_Space->space.
113#
114# If multiple additional aliases are defined,
115# then they are separated in the value string with '|'.
116# For example, White_Space->space|outer_space
117my %additional_property_aliases;
118
119#----------------------------------------------------------------------
120
121# Emitted class names
122my ($STRING_CLASS, $ALIAS_CLASS, $PROPERTY_CLASS) = qw(AliasName Alias Property);
123
124if ($count < 1 || $count > 2 ||
125 !-d $HEADER_DIR ||
126 !-d $UNIDATA_DIR) {
127 my $me = $0;
128 $me =~ s|.+[/\\]||;
129 my $lm = ' ' x length($me);
130 print <<"END";
131
132$me: Reads ICU4C headers and Unicode data files and creates
133$lm a C header file that is included by genpname. The header
134$lm file matches constants defined in the ICU4C headers with
135$lm property|value aliases in the Unicode data files.
136
137Usage: $me <icu_dir> [<out_file>]
138
139<icu_dir> ICU4C root directory, containing
140 source/common/unicode/uchar.h
141 source/common/unicode/uscript.h
142 source/data/unidata/Blocks.txt
143 source/data/unidata/PropertyAliases.txt
144 source/data/unidata/PropertyValueAliases.txt
145<out_file> File name of header to be written;
146 default is 'data.h'.
147
148The Unicode versions of all input files must match.
149END
150 exit(1);
151}
152
153my ($h, $version) = readAndMerge($HEADER_DIR, $UNIDATA_DIR);
154
155if ($DEBUG) {
156 print "Merged hash:\n";
157 for my $key (sort keys %$h) {
158 my $hh = $h->{$key};
159 for my $subkey (sort keys %$hh) {
160 print "$key:$subkey:", $hh->{$subkey}, "\n";
161 }
162 }
163}
164
165my $out = new FileHandle($OUT_FILE, 'w');
166die "Error: Can't write to $OUT_FILE: $!" unless (defined $out);
167my $save = select($out);
168formatData($h, $version);
169select($save);
170$out->close();
171
172exit(0);
173
174#----------------------------------------------------------------------
175# From PropList.html: "The properties of the form Other_XXX
176# are used to generate properties in DerivedCoreProperties.txt.
177# They are not intended for general use, such as in APIs that
178# return property values.
179# Non_Break is not a valid property as of 3.2.
180sub isIgnoredProperty {
181 local $_ = shift;
182 /^Other_/i || /^Non_Break$/i;
183}
184
185# 'qc' is a pseudo-property matching any quick-check property
186# see PropertyValueAliases.txt file comments. 'binprop' is
187# a synthetic binary value alias "True"/"False", not present
188# in PropertyValueAliases.txt.
189sub isPseudoProperty {
190 $_[0] eq 'qc' ||
191 $_[0] eq 'binprop';
192}
193
194#----------------------------------------------------------------------
195# Emit the combined data from headers and the Unicode database as a
196# C source code header file.
197#
198# @param ref to hash with the data
199# @param Unicode version, as a string
200sub formatData {
201 my $h = shift;
202 my $version = shift;
203
204 my $date = scalar localtime();
205 print <<"END";
206/**
207 * Copyright (C) 2002-$YEAR, International Business Machines Corporation and
208 * others. All Rights Reserved.
209 *
210 * MACHINE GENERATED FILE. !!! Do not edit manually !!!
211 *
212 * Generated from
213 * uchar.h
214 * uscript.h
215 * Blocks.txt
216 * PropertyAliases.txt
217 * PropertyValueAliases.txt
218 *
219 * Date: $date
220 * Unicode version: $version
221 * Script: $0
222 */
223
224END
225
226 #------------------------------------------------------------
227 # Emit Unicode version
228 print "/* Unicode version $version */\n";
229 my @v = split(/\./, $version);
230 push @v, '0' while (@v < 4);
231 for (my $i=0; $i<@v; ++$i) {
232 print "const uint8_t VERSION_$i = $v[$i];\n";
233 }
234 print "\n";
235
236 #------------------------------------------------------------
237 # Emit String table
238 # [A table of all identifiers, that is, all long or short property
239 # or value names. The list need NOT be sorted; it will be sorted
240 # by the C program. Strings are referenced by their index into
241 # this table. After sorting, a REMAP[] array is used to map the
242 # old position indices to the new positions.]
243 my %strings;
244 for my $prop (sort keys %$h) {
245 my $hh = $h->{$prop};
246 for my $enum (sort keys %$hh) {
247 my @a = split(/\|/, $hh->{$enum});
248 for (@a) {
249 $strings{$_} = 1 if (length($_));
250 }
251 }
252 }
253 my @strings = sort keys %strings;
254 unshift @strings, "";
255
256 print "const int32_t STRING_COUNT = ", scalar @strings, ";\n\n";
257
258 # while printing, create a mapping hash from string table entry to index
259 my %stringToID;
260 print "/* to be sorted */\n";
261 print "const $STRING_CLASS STRING_TABLE[] = {\n";
262 for (my $i=0; $i<@strings; ++$i) {
263 print " $STRING_CLASS(\"$strings[$i]\", $i),\n";
264 $stringToID{$strings[$i]} = $i;
265 }
266 print "};\n\n";
267
268 # placeholder for the remapping index. this is used to map
269 # indices that we compute here to indices of the sorted
270 # STRING_TABLE. STRING_TABLE will be sorted by the C++ program
271 # using the uprv_comparePropertyNames() function. this will
272 # reshuffle the order. we then use the indices (passed to the
273 # String constructor) to create a REMAP[] array.
274 print "/* to be filled in */\n";
275 print "int32_t REMAP[", scalar @strings, "];\n\n";
276
277 #------------------------------------------------------------
278 # Emit the name group table
279 # [A table of name groups. A name group is one or more names
280 # for a property or property value. The Unicode data files specify
281 # that there may be more than 2, although as of Unicode 3.2 there
282 # are at most 2. The name group table looks like this:
283 #
284 # 114, -115, 116, -117, 0, -118, 65, -64, ...
285 # [0] [2] [4] [6]
286 #
287 # The entry at [0] consists of 2 strings, 114 and 115.
288 # The entry at [2] consists of 116 and 117. The entry at
289 # [4] is one string, 118. There is always at least one
290 # string; typically there are two. If there are two, the first
291 # is the SHORT name and the second is the LONG. If there is
292 # one, then the missing entry (always the short name, in 3.2)
293 # is zero, which is by definition the index of "". The
294 # 'preferred' name will generally be the LONG name, if there are
295 # more than 2 entries. The last entry is negative.
296
297 # Build name group list and replace string refs with nameGroup indices
298 my @nameGroups;
299
300 # Check for duplicate name groups, and reuse them if possible
301 my %groupToInt; # Map group strings to ints
302 for my $prop (sort keys %$h) {
303 my $hh = $h->{$prop};
304 for my $enum (sort keys %$hh) {
305 my $groupString = $hh->{$enum};
306 my $i;
307 if (exists $groupToInt{$groupString}) {
308 $i = $groupToInt{$groupString};
309 } else {
310 my @names = split(/\|/, $groupString);
311 die "Error: Wrong number of names in " . $groupString if (@names < 1);
312 $i = @nameGroups; # index of group we are making
313 $groupToInt{$groupString} = $i; # Cache for reuse
314 push @nameGroups, map { $stringToID{$_} } @names;
315 $nameGroups[$#nameGroups] = -$nameGroups[$#nameGroups]; # mark end
316 }
317 # now, replace string list with ref to name group
318 $hh->{$enum} = $i;
319 }
320 }
321
322 print "const int32_t NAME_GROUP_COUNT = ",
323 scalar @nameGroups, ";\n\n";
324
325 print "int32_t NAME_GROUP[] = {\n";
326 # emit one group per line, with annotations
327 my $max_names = 0;
328 for (my $i=0; $i<@nameGroups; ) {
329 my @a;
330 my $line;
331 my $start = $i;
332 for (;;) {
333 my $j = $nameGroups[$i++];
334 $line .= "$j, ";
335 push @a, abs($j);
336 last if ($j < 0);
337 }
338 print " ",
339 $line,
340 ' 'x(20-length($line)),
341 "/* ", sprintf("%3d", $start),
342 ": \"", join("\", \"", map { $strings[$_] } @a), "\" */\n";
343 $max_names = @a if(@a > $max_names);
344
345 }
346 print "};\n\n";
347
348 # This is fixed for 3.2 at "2" but should be calculated dynamically
349 # when more than 2 names appear in Property[Value]Aliases.txt.
350 print "#define MAX_NAMES_PER_GROUP $max_names\n\n";
351
352 #------------------------------------------------------------
353 # Emit enumerated property values
354 for my $prop (sort keys %$h) {
355 next if ($prop =~ /^_/);
356 my $vh = $h->{$prop};
357 my $count = scalar keys %$vh;
358
359 print "const int32_t VALUES_${prop}_COUNT = ",
360 $count, ";\n\n";
361
362 print "const $ALIAS_CLASS VALUES_${prop}\[] = {\n";
363 for my $enum (sort keys %$vh) {
364 #my @names = split(/\|/, $vh->{$enum});
365 #die "Error: Wrong number of names for $prop:$enum in [" . join(",", @names) . "]"
366 # if (@names != 2);
367 print " $ALIAS_CLASS((int32_t) $enum, ", $vh->{$enum}, "),\n";
368 #$stringToID{$names[0]}, ", ",
369 #$stringToID{$names[1]}, "),\n";
370 # "\"", $names[0], "\", ",
371 # "\"", $names[1], "\"),\n";
372 }
373 print "};\n\n";
374 }
375
376 #------------------------------------------------------------
377 # Emit top-level properties (binary, enumerated, etc.)
378 for my $topName (sort keys %TOP_PROPS) {
379 my $a = $TOP_PROPS{$topName};
380 my $count = 0;
381 for my $type (@$a) { # "_bp", "_ep", etc.
382 $count += scalar keys %{$h->{$type}};
383 }
384
385 print "const int32_t ${topName}PROPERTY_COUNT = $count;\n\n";
386
387 print "const $PROPERTY_CLASS ${topName}PROPERTY[] = {\n";
388
389 for my $type (@$a) { # "_bp", "_ep", etc.
390 my $p = $h->{$type};
391
392 for my $enum (sort keys %$p) {
393 my $name = $strings[$nameGroups[$p->{$enum}]];
394
395 my $valueRef = "0, NULL";
396 if ($type eq '_bp') {
397 $valueRef = "VALUES_binprop_COUNT, VALUES_binprop";
398 }
399 elsif (exists $h->{$name}) {
400 $valueRef = "VALUES_${name}_COUNT, VALUES_$name";
401 }
402
403 print " $PROPERTY_CLASS((int32_t) $enum, ",
404 $p->{$enum}, ", $valueRef),\n";
405 }
406 }
407 print "};\n\n";
408 }
409
410 print "/*eof*/\n";
411}
412
413#----------------------------------------------------------------------
414# Read in the files uchar.h, uscript.h, Blocks.txt,
415# PropertyAliases.txt, and PropertyValueAliases.txt,
416# and combine them into one hash.
417#
418# @param directory containing headers
419# @param directory containin Unicode data files
420#
421# @return hash ref, Unicode version
422sub readAndMerge {
423
424 my ($headerDir, $unidataDir) = @_;
425
426 my $h = read_uchar("$headerDir/uchar.h");
427 my $s = read_uscript("$headerDir/uscript.h");
428 my $b = read_Blocks("$unidataDir/Blocks.txt");
429 my $pa = {};
430 read_PropertyAliases($pa, "$unidataDir/PropertyAliases.txt");
431 read_PropertyAliases($pa, "SyntheticPropertyAliases.txt");
432 my $va = {};
433 read_PropertyValueAliases($va, "$unidataDir/PropertyValueAliases.txt");
434 read_PropertyValueAliases($va, "SyntheticPropertyValueAliases.txt");
435
436 # Extract property family hash
437 my $fam = $pa->{'_family'};
438 delete $pa->{'_family'};
439
440 # Note: uscript.h has no version string, so don't check it
441 my $version = check_versions([ 'uchar.h', $h ],
442 [ 'Blocks.txt', $b ],
443 [ 'PropertyAliases.txt', $pa ],
444 [ 'PropertyValueAliases.txt', $va ]);
445
446 # Do this BEFORE merging; merging modifies the hashes
447 check_PropertyValueAliases($pa, $va);
448
449 # Dump out the $va hash for debugging
450 if ($DEBUG) {
451 print "Property values hash:\n";
452 for my $key (sort keys %$va) {
453 my $hh = $va->{$key};
454 for my $subkey (sort keys %$hh) {
455 print "$key:$subkey:", $hh->{$subkey}, "\n";
456 }
457 }
458 }
459
460 # Dump out the $s hash for debugging
461 if ($DEBUG) {
462 print "Script hash:\n";
463 for my $key (sort keys %$s) {
464 print "$key:", $s->{$key}, "\n";
465 }
466 }
467
468 # Link in the script data
469 $h->{'sc'} = $s;
470
471 merge_Blocks($h, $b);
472
473 merge_PropertyAliases($h, $pa, $fam);
474
475 merge_PropertyValueAliases($h, $va);
476
477 ($h, $version);
478}
479
480#----------------------------------------------------------------------
481# Ensure that the version strings in the given hashes (under the key
482# '_version') are compatible. Currently this means they must be
483# identical, with the exception that "X.Y" will match "X.Y.0".
484# All hashes must define the key '_version'.
485#
486# @param a list of pairs of (file name, hash reference)
487#
488# @return the version of all the hashes. Upon return, the '_version'
489# will be removed from all hashes.
490sub check_versions {
491 my $version = '';
492 my $msg = '';
493 foreach my $a (@_) {
494 my $name = $a->[0];
495 my $h = $a->[1];
496 die "Error: No version found" unless (exists $h->{'_version'});
497 my $v = $h->{'_version'};
498 delete $h->{'_version'};
499
500 # append ".0" if necessary, to standardize to X.Y.Z
501 $v .= '.0' unless ($v =~ /\.\d+\./);
502 $v .= '.0' unless ($v =~ /\.\d+\./);
503 $msg .= "$name = $v\n";
504 if ($version) {
505 die "Error: Mismatched Unicode versions\n$msg"
506 unless ($version eq $v);
507 } else {
508 $version = $v;
509 }
510 }
511 $version;
512}
513
514#----------------------------------------------------------------------
515# Make sure the property names in PropertyValueAliases.txt match those
516# in PropertyAliases.txt.
517#
518# @param a hash ref from read_PropertyAliases.
519# @param a hash ref from read_PropertyValueAliases.
520sub check_PropertyValueAliases {
521 my ($pa, $va) = @_;
522
523 # make a reverse hash of short->long
524 my %rev;
525 for (keys %$pa) { $rev{$pa->{$_}} = $_; }
526
527 for my $prop (keys %$va) {
528 if (!exists $rev{$prop} && !isPseudoProperty($prop)) {
529 print "Warning: Property $prop from PropertyValueAliases not listed in PropertyAliases\n";
530 }
531 }
532}
533
534#----------------------------------------------------------------------
535# Merge blocks data into uchar.h enum data. In the 'blk' subhash all
536# code point values, as returned from read_uchar, are replaced by
537# block names, as read from Blocks.txt and returned by read_Blocks.
538# The match must be 1-to-1. If there is any failure of 1-to-1
539# mapping, an error is signaled. Upon return, the read_Blocks hash
540# is emptied of all contents, except for those that failed to match.
541#
542# The mapping in the 'blk' subhash, after this function returns, is
543# from uchar.h enum name, e.g. "UBLOCK_BASIC_LATIN", to Blocks.h
544# pseudo-name, e.g. "Basic Latin".
545#
546# @param a hash ref from read_uchar.
547# @param a hash ref from read_Blocks.
548sub merge_Blocks {
549 my ($h, $b) = @_;
550
551 die "Error: No blocks data in uchar.h"
552 unless (exists $h->{'blk'});
553 my $blk = $h->{'blk'};
554 for my $enum (keys %$blk) {
555 my $cp = $blk->{$enum};
556 if ($cp && !exists $b->{$cp}) {
557 die "Error: No block found at $cp in Blocks.txt";
558 }
559 # Convert code point to pseudo-name:
560 $blk->{$enum} = $b->{$cp};
561 delete $b->{$cp};
562 }
563 my $err = '';
564 for my $cp (keys %$b) {
565 $err .= "Error: Block " . $b->{$cp} . " not listed in uchar.h\n";
566 }
567 die $err if ($err);
568}
569
570#----------------------------------------------------------------------
571# Merge property alias names into the uchar.h hash. The subhashes
572# under the keys _* (b(inary, e(numerated, s(tring, d(ouble) are
573# examined and the values of those subhashes are assumed to be long
574# names in PropertyAliases.txt. They are validated and replaced by
575# "<short>|<long>". Upon return, the read_PropertyAliases hash is
576# emptied of all contents, except for those that failed to match.
577# Unmatched names in PropertyAliases are listed as a warning but do
578# NOT cause the script to die.
579#
580# @param a hash ref from read_uchar.
581# @param a hash ref from read_PropertyAliases.
582# @param a hash mapping long names to property family (e.g., 'binary')
583sub merge_PropertyAliases {
584 my ($h, $pa, $fam) = @_;
585
586 for my $k (@TOP) {
587 die "Error: No properties data for $k in uchar.h"
588 unless (exists $h->{$k});
589 }
590
591 for my $subh (map { $h->{$_} } @TOP) {
592 for my $enum (keys %$subh) {
593 my $long_name = $subh->{$enum};
594 if (!exists $pa->{$long_name}) {
595 die "Error: Property $long_name not found (or used more than once)";
596 }
597
598 my $value;
599 if($pa->{$long_name} =~ m|^n/a\d*$|) {
600 # replace an "n/a" short name with an empty name (nothing before "|");
601 # don't remove it (don't remove the "|"): there must always be a long name,
602 # and if the short name is removed, then the long name becomes the
603 # short name and there is no long name left (unless there is another alias)
604 $value = "|" . $long_name;
605 } else {
606 $value = $pa->{$long_name} . "|" . $long_name;
607 }
608 if (exists $additional_property_aliases{$long_name}) {
609 $value .= "|" . $additional_property_aliases{$long_name};
610 }
611 $subh->{$enum} = $value;
612 delete $pa->{$long_name};
613 }
614 }
615
616 my @err;
617 for my $name (keys %$pa) {
618 $MISSING_FROM_UCHAR{$pa->{$name}} = 1;
619 if (exists $UNSUPPORTED{$name}) {
620 push @err, "Info: No enum for " . $fam->{$name} . " property $name in uchar.h";
621 } elsif (!isIgnoredProperty($name)) {
622 push @err, "Warning: No enum for " . $fam->{$name} . " property $name in uchar.h";
623 }
624 }
625 print join("\n", sort @err), "\n" if (@err);
626}
627
628#----------------------------------------------------------------------
629# Return 1 if two names match ignoring whitespace, '-', and '_'.
630# Used to match names in Blocks.txt with those in PropertyValueAliases.txt
631# as of Unicode 4.0.
632sub matchesLoosely {
633 my ($a, $b) = @_;
634 $a =~ s/[\s\-_]//g;
635 $b =~ s/[\s\-_]//g;
636 $a =~ /^$b$/i;
637}
638
639#----------------------------------------------------------------------
640# Merge PropertyValueAliases.txt data into the uchar.h hash. All
641# properties other than blk, _bp, and _ep are analyzed and mapped to
642# the names listed in PropertyValueAliases. They are then replaced
643# with a string of the form "<short>|<long>". The short or long name
644# may be missing.
645#
646# @param a hash ref from read_uchar.
647# @param a hash ref from read_PropertyValueAliases.
648sub merge_PropertyValueAliases {
649 my ($h, $va) = @_;
650
651 my %gcCount;
652 for my $prop (keys %$h) {
653 # _bp, _ep handled in merge_PropertyAliases
654 next if ($prop =~ /^_/);
655
656 # Special case: gcm
657 my $prop2 = ($prop eq 'gcm') ? 'gc' : $prop;
658
659 # find corresponding PropertyValueAliases data
660 die "Error: Can't find $prop in PropertyValueAliases.txt"
661 unless (exists $va->{$prop2});
662 my $pva = $va->{$prop2};
663
664 # match up data
665 my $hh = $h->{$prop};
666 for my $enum (keys %$hh) {
667
668 my $name = $hh->{$enum};
669
670 # look up both long and short & ignore case
671 my $n;
672 if (exists $pva->{$name}) {
673 $n = $name;
674 } else {
675 # iterate (slow)
676 for my $a (keys %$pva) {
677 # case-insensitive match
678 # & case-insensitive reverse match
679 if ($a =~ /^$name$/i ||
680 $pva->{$a} =~ /^$name$/i) {
681 $n = $a;
682 last;
683 }
684 }
685 }
686
687 # For blocks, do a loose match from Blocks.txt pseudo-name
688 # to PropertyValueAliases long name.
689 if (!$n && $prop eq 'blk') {
690 for my $a (keys %$pva) {
691 # The block is only going to match the long name,
692 # but we check both for completeness. As of Unicode
693 # 4.0, blocks do not have short names.
694 if (matchesLoosely($name, $pva->{$a}) ||
695 matchesLoosely($name, $a)) {
696 $n = $a;
697 last;
698 }
699 }
700 }
701
702 die "Error: Property value $prop:$name not found" unless ($n);
703
704 my $l = $n;
705 my $r = $pva->{$n};
706 # convert |n/a\d*| to blank
707 $l = '' if ($l =~ m|^n/a\d*$|);
708 $r = '' if ($r =~ m|^n/a\d*$|);
709
710 $hh->{$enum} = "$l|$r";
711 # Don't delete the 'gc' properties because we need to share
712 # them between 'gc' and 'gcm'. Count each use instead.
713 if ($prop2 eq 'gc') {
714 ++$gcCount{$n};
715 } else {
716 delete $pva->{$n};
717 }
718 }
719 }
720
721 # Merge the combining class values in manually
722 # Add the same values to the synthetic lccc and tccc properties
723 die "Error: No ccc data"
724 unless exists $va->{'ccc'};
725 for my $ccc (keys %{$va->{'ccc'}}) {
726 die "Error: Can't overwrite ccc $ccc"
727 if (exists $h->{'ccc'}->{$ccc});
728 $h->{'lccc'}->{$ccc} =
729 $h->{'tccc'}->{$ccc} =
730 $h->{'ccc'}->{$ccc} = $va->{'ccc'}->{$ccc};
731 }
732 delete $va->{'ccc'};
733
734 # Merge synthetic binary property values in manually.
735 # These are the "True" and "False" value aliases.
736 die "Error: No True/False value aliases"
737 unless exists $va->{'binprop'};
738 for my $bp (keys %{$va->{'binprop'}}) {
739 $h->{'binprop'}->{$bp} = $va->{'binprop'}->{$bp};
740 }
741 delete $va->{'binprop'};
742
743 my $err = '';
744 for my $prop (sort keys %$va) {
745 my $hh = $va->{$prop};
746 for my $subkey (sort keys %$hh) {
747 # 'gc' props are shared with 'gcm'; make sure they were used
748 # once or twice.
749 if ($prop eq 'gc') {
750 my $n = $gcCount{$subkey};
751 next if ($n >= 1 && $n <= 2);
752 }
753 $err .= "Warning: Enum for value $prop:$subkey not found in uchar.h\n"
754 unless exists $MISSING_FROM_UCHAR{$prop};
755 }
756 }
757 print $err if ($err);
758}
759
760#----------------------------------------------------------------------
761# Read the PropertyAliases.txt file. Return a hash that maps the long
762# name to the short name. The special key '_version' will map to the
763# Unicode version of the file. The special key '_family' holds a
764# subhash that maps long names to a family string, for descriptive
765# purposes.
766#
767# @param a filename for PropertyAliases.txt
768# @param reference to hash to receive data. Keys are long names.
769# Values are short names.
770sub read_PropertyAliases {
771
772 my $hash = shift; # result
773
774 my $filename = shift;
775
776 my $fam = {}; # map long names to family string
777 $fam = $hash->{'_family'} if (exists $hash->{'_family'});
778
779 my $family; # binary, enumerated, etc.
780
781 my $in = new FileHandle($filename, 'r');
782 die "Error: Cannot open $filename" if (!defined $in);
783
784 while (<$in>) {
785
786 # Read version (embedded in a comment)
787 if (/PropertyAliases-(\d+\.\d+\.\d+)/i) {
788 die "Error: Multiple versions in $filename"
789 if (exists $hash->{'_version'});
790 $hash->{'_version'} = $1;
791 }
792
793 # Read family heading
794 if (/^\s*\#\s*(.+?)\s*Properties\s*$/) {
795 $family = $1;
796 }
797
798 # Ignore comments and blank lines
799 s/\#.*//;
800 next unless (/\S/);
801
802 if (/^\s*(.+?)\s*;/) {
803 my $short = $1;
804 my @fields = /;\s*([^\s;]+)/g;
805 if (@fields < 1 || @fields > 2) {
806 my $number = @fields;
807 die "Error: Wrong number of fields ($number) in $filename at $_";
808 }
809
810 # Make "n/a" strings unique
811 if ($short eq 'n/a') {
812 $short .= sprintf("%03d", $propNA++);
813 }
814 my $long = $fields[0];
815 if ($long eq 'n/a') {
816 $long .= sprintf("%03d", $propNA++);
817 }
818
819 # Add long name->short name to the hash=pa hash table
820 if (exists $hash->{$long}) {
821 die "Error: Duplicate property $long in $filename"
822 }
823 $hash->{$long} = $short;
824 $fam->{$long} = $family;
825
826 # Add the list of further aliases to the additional_property_aliases hash table,
827 # using the long property name as the key.
828 # For example:
829 # White_Space->space|outer_space
830 if (@fields > 1) {
831 my $value = pop @fields;
832 while (@fields > 1) {
833 $value .= "|" . pop @fields;
834 }
835 $additional_property_aliases{$long} = $value;
836 }
837 } else {
838 die "Error: Can't parse $_ in $filename";
839 }
840 }
841
842 $in->close();
843
844 $hash->{'_family'} = $fam;
845}
846
847#----------------------------------------------------------------------
848# Read the PropertyValueAliases.txt file. Return a two level hash
849# that maps property_short_name:value_short_name:value_long_name. In
850# the case of the 'ccc' property, the short name is the numeric class
851# and the long name is "<short>|<long>". The special key '_version'
852# will map to the Unicode version of the file.
853#
854# @param a filename for PropertyValueAliases.txt
855#
856# @return a hash reference.
857sub read_PropertyValueAliases {
858
859 my $hash = shift; # result
860
861 my $filename = shift;
862
863 my $in = new FileHandle($filename, 'r');
864 die "Error: Cannot open $filename" if (!defined $in);
865
866 while (<$in>) {
867
868 # Read version (embedded in a comment)
869 if (/PropertyValueAliases-(\d+\.\d+\.\d+)/i) {
870 die "Error: Multiple versions in $filename"
871 if (exists $hash->{'_version'});
872 $hash->{'_version'} = $1;
873 }
874
875 # Ignore comments and blank lines
876 s/\#.*//;
877 next unless (/\S/);
878
879 if (/^\s*(.+?)\s*;/i) {
880 my $prop = $1;
881 my @fields = /;\s*([^\s;]+)/g;
882 die "Error: Wrong number of fields in $filename"
883 if (@fields < 2 || @fields > 3);
884 # Make "n/a" strings unique
885 $fields[0] .= sprintf("%03d", $valueNA++) if ($fields[0] eq 'n/a');
886 # Squash extra fields together
887 while (@fields > 2) {
888 my $f = pop @fields;
889 $fields[$#fields] .= '|' . $f;
890 }
891 addDatum($hash, $prop, @fields);
892 }
893
894 else {
895 die "Error: Can't parse $_ in $filename";
896 }
897 }
898
899 $in->close();
900
901 # Script Copt=Qaac (Coptic) is a special case.
902 # Before the Copt code was defined, the private-use code Qaac was used.
903 # Starting with Unicode 4.1, PropertyValueAliases.txt contains
904 # Copt as the short name as well as Qaac as an alias.
905 # For use with older Unicode data files, we add here a Qaac->Coptic entry.
906 # This should not do anything for 4.1-and-later Unicode data files.
907 # See also UAX #24: Script Names http://www.unicode.org/unicode/reports/tr24/
908 $hash->{'sc'}->{'Qaac'} = 'Coptic'
909 unless (exists $hash->{'sc'}->{'Qaac'} || exists $hash->{'sc'}->{'Copt'});
910
911 # Add T|True and F|False -- these are values we recognize for
912 # binary properties (NOT from PropertyValueAliases.txt). These
913 # are of the same form as the 'ccc' value aliases.
914 $hash->{'binprop'}->{'0'} = 'F|False';
915 $hash->{'binprop'}->{'1'} = 'T|True';
916}
917
918#----------------------------------------------------------------------
919# Read the Blocks.txt file. Return a hash that maps the code point
920# range start to the block name. The special key '_version' will map
921# to the Unicode version of the file.
922#
923# As of Unicode 4.0, the names in the Blocks.txt are no longer the
924# proper names. The proper names are now listed in PropertyValueAliases.
925# They are similar but not identical. Furthermore, 4.0 introduces
926# a new block name, No_Block, which is listed only in PropertyValueAliases
927# and not in Blocks.txt. As a result, we handle blocks as follows:
928#
929# 1. Read Blocks.txt to map code point range start to quasi-block name.
930# 2. Add to Blocks.txt a synthetic No Block code point & name:
931# X -> No Block
932# 3. Map quasi-names from Blocks.txt (including No Block) to actual
933# names from PropertyValueAliases. This occurs in
934# merge_PropertyValueAliases.
935#
936# @param a filename for Blocks.txt
937#
938# @return a ref to a hash. Keys are code points, as text, e.g.,
939# "1720". Values are pseudo-block names, e.g., "Hanunoo".
940sub read_Blocks {
941
942 my $filename = shift;
943
944 my $hash = {}; # result
945
946 my $in = new FileHandle($filename, 'r');
947 die "Error: Cannot open $filename" if (!defined $in);
948
949 while (<$in>) {
950
951 # Read version (embedded in a comment)
952 if (/Blocks-(\d+\.\d+\.\d+)/i) {
953 die "Error: Multiple versions in $filename"
954 if (exists $hash->{'_version'});
955 $hash->{'_version'} = $1;
956 }
957
958 # Ignore comments and blank lines
959 s/\#.*//;
960 next unless (/\S/);
961
962 if (/^([0-9a-f]+)\.\.[0-9a-f]+\s*;\s*(.+?)\s*$/i) {
963 die "Error: Duplicate range $1 in $filename"
964 if (exists $hash->{$1});
965 $hash->{$1} = $2;
966 }
967
968 else {
969 die "Error: Can't parse $_ in $filename";
970 }
971 }
972
973 $in->close();
974
975 # Add pseudo-name for No Block
976 $hash->{'none'} = 'No Block';
977
978 $hash;
979}
980
981#----------------------------------------------------------------------
982# Read the uscript.h file and compile a mapping of Unicode symbols to
983# icu4c enum values.
984#
985# @param a filename for uscript.h
986#
987# @return a ref to a hash. The keys of the hash are enum symbols from
988# uscript.h, and the values are script names.
989sub read_uscript {
990
991 my $filename = shift;
992
993 my $mode = ''; # state machine mode and submode
994 my $submode = '';
995
996 my $last = ''; # for line folding
997
998 my $hash = {}; # result
999 my $key; # first-level key
1000
1001 my $in = new FileHandle($filename, 'r');
1002 die "Error: Cannot open $filename" if (!defined $in);
1003
1004 while (<$in>) {
1005 # Fold continued lines together
1006 if (/^(.*)\\$/) {
1007 $last = $1;
1008 next;
1009 } elsif ($last) {
1010 $_ = $last . $_;
1011 $last = '';
1012 }
1013
1014 # Exit all modes here
1015 if ($mode && $mode ne 'DEPRECATED') {
1016 if (/^\s*\}/) {
1017 $mode = '';
1018 next;
1019 }
1020 }
1021
1022 # Handle individual modes
1023
1024 if ($mode eq 'UScriptCode') {
1025 if (m|^\s*(USCRIPT_\w+).+?/\*\s*(\w+)|) {
1026 my ($enum, $code) = ($1, $2);
1027 die "Error: Duplicate script $enum"
1028 if (exists $hash->{$enum});
1029 $hash->{$enum} = $code;
1030 }
1031 }
1032
1033 elsif ($mode eq 'DEPRECATED') {
1034 if (/\s*\#ifdef/) {
1035 die "Error: Nested #ifdef";
1036 }
1037 elsif (/\s*\#endif/) {
1038 $mode = '';
1039 }
1040 }
1041
1042 elsif (!$mode) {
1043 if (/^\s*typedef\s+enum\s+(\w+)\s*\{/ ||
1044 /^\s*typedef\s+enum\s+(\w+)\s*$/) {
1045 $mode = $1;
1046 #print "Parsing $mode\n";
1047 }
1048
1049 elsif (/^\s*\#ifdef\s+ICU_UCHAR_USE_DEPRECATES\b/) {
1050 $mode = 'DEPRECATED';
1051 }
1052 }
1053 }
1054
1055 $in->close();
1056
1057 $hash;
1058}
1059
1060#----------------------------------------------------------------------
1061# Read the uchar.h file and compile a mapping of Unicode symbols to
1062# icu4c enum values.
1063#
1064# @param a filename for uchar.h
1065#
1066# @return a ref to a hash. The keys of the hash are '_bp' for binary
1067# properties, '_ep' for enumerated properties, '_dp'/'_sp'/'_mp' for
1068# double/string/mask properties, and 'gc', 'gcm', 'bc', 'blk',
1069# 'ea', 'dt', 'jt', 'jg', 'lb', or 'nt' for corresponding property
1070# value aliases. The values of the hash are subhashes. The subhashes
1071# have a key of the uchar.h enum symbol, and a value of the alias
1072# string (as listed in PropertyValueAliases.txt). NOTE: The alias
1073# string is whatever alias uchar.h lists. This may be either short or
1074# long, depending on the specific enum. NOTE: For blocks ('blk'), the
1075# value is a hex code point for the start of the associated block.
1076# NOTE: The special key _version will map to the Unicode version of
1077# the file.
1078sub read_uchar {
1079
1080 my $filename = shift;
1081
1082 my $mode = ''; # state machine mode and submode
1083 my $submode = '';
1084
1085 my $last = ''; # for line folding
1086
1087 my $hash = {}; # result
1088 my $key; # first-level key
1089
1090 my $in = new FileHandle($filename, 'r');
1091 die "Error: Cannot open $filename" if (!defined $in);
1092
1093 while (<$in>) {
1094 # Fold continued lines together
1095 if (/^(.*)\\$/) {
1096 $last .= $1;
1097 next;
1098 } elsif ($last) {
1099 $_ = $last . $_;
1100 $last = '';
1101 }
1102
1103 # Exit all modes here
1104 if ($mode && $mode ne 'DEPRECATED') {
1105 if (/^\s*\}/) {
1106 $mode = '';
1107 next;
1108 }
1109 }
1110
1111 # Handle individual modes
1112
1113 if ($mode eq 'UProperty') {
1114 if (/^\s*(UCHAR_\w+)\s*[,=]/ || /^\s+(UCHAR_\w+)\s*$/) {
1115 if ($submode) {
1116 addDatum($hash, $key, $1, $submode);
1117 $submode = '';
1118 } else {
1119 #print "Warning: Ignoring $1\n";
1120 }
1121 }
1122
1123 elsif (m|^\s*/\*\*\s*(\w+)\s+property\s+(\w+)|i) {
1124 die "Error: Unmatched tag $submode" if ($submode);
1125 die "Error: Unrecognized UProperty comment: $_"
1126 unless (exists $PROP_TYPE{$1});
1127 $key = $PROP_TYPE{$1};
1128 $submode = $2;
1129 }
1130 }
1131
1132 elsif ($mode eq 'UCharCategory') {
1133 if (/^\s*(U_\w+)\s*=/) {
1134 if ($submode) {
1135 addDatum($hash, 'gc', $1, $submode);
1136 $submode = '';
1137 } else {
1138 #print "Warning: Ignoring $1\n";
1139 }
1140 }
1141
1142 elsif (m|^\s*/\*\*\s*([A-Z][a-z])\s|) {
1143 die "Error: Unmatched tag $submode" if ($submode);
1144 $submode = $1;
1145 }
1146 }
1147
1148 elsif ($mode eq 'UCharDirection') {
1149 if (/^\s*(U_\w+)\s*[,=]/ || /^\s+(U_\w+)\s*$/) {
1150 if ($submode) {
1151 addDatum($hash, $key, $1, $submode);
1152 $submode = '';
1153 } else {
1154 #print "Warning: Ignoring $1\n";
1155 }
1156 }
1157
1158 elsif (m|/\*\*\s*([A-Z]+)\s|) {
1159 die "Error: Unmatched tag $submode" if ($submode);
1160 $key = 'bc';
1161 $submode = $1;
1162 }
1163 }
1164
1165 elsif ($mode eq 'UBlockCode') {
1166 if (m|^\s*(UBLOCK_\w+).+?/\*\[(.+?)\]\*/|) {
1167 addDatum($hash, 'blk', $1, $2);
1168 }
1169 }
1170
1171 elsif ($mode eq 'UEastAsianWidth') {
1172 if (m|^\s*(U_EA_\w+).+?/\*\[(.+?)\]\*/|) {
1173 addDatum($hash, 'ea', $1, $2);
1174 }
1175 }
1176
1177 elsif ($mode eq 'UDecompositionType') {
1178 if (m|^\s*(U_DT_\w+).+?/\*\[(.+?)\]\*/|) {
1179 addDatum($hash, 'dt', $1, $2);
1180 }
1181 }
1182
1183 elsif ($mode eq 'UJoiningType') {
1184 if (m|^\s*(U_JT_\w+).+?/\*\[(.+?)\]\*/|) {
1185 addDatum($hash, 'jt', $1, $2);
1186 }
1187 }
1188
1189 elsif ($mode eq 'UJoiningGroup') {
1190 if (/^\s*(U_JG_(\w+))/) {
1191 addDatum($hash, 'jg', $1, $2) unless ($2 eq 'COUNT');
1192 }
1193 }
1194
1195 elsif ($mode eq 'UGraphemeClusterBreak') {
1196 if (m|^\s*(U_GCB_\w+).+?/\*\[(.+?)\]\*/|) {
1197 addDatum($hash, 'GCB', $1, $2);
1198 }
1199 }
1200
1201 elsif ($mode eq 'UWordBreakValues') {
1202 if (m|^\s*(U_WB_\w+).+?/\*\[(.+?)\]\*/|) {
1203 addDatum($hash, 'WB', $1, $2);
1204 }
1205 }
1206
1207 elsif ($mode eq 'USentenceBreak') {
1208 if (m|^\s*(U_SB_\w+).+?/\*\[(.+?)\]\*/|) {
1209 addDatum($hash, 'SB', $1, $2);
1210 }
1211 }
1212
1213 elsif ($mode eq 'ULineBreak') {
1214 if (m|^\s*(U_LB_\w+).+?/\*\[(.+?)\]\*/|) {
1215 addDatum($hash, 'lb', $1, $2);
1216 }
1217 }
1218
1219 elsif ($mode eq 'UNumericType') {
1220 if (m|^\s*(U_NT_\w+).+?/\*\[(.+?)\]\*/|) {
1221 addDatum($hash, 'nt', $1, $2);
1222 }
1223 }
1224
1225 elsif ($mode eq 'UHangulSyllableType') {
1226 if (m|^\s*(U_HST_\w+).+?/\*\[(.+?)\]\*/|) {
1227 addDatum($hash, 'hst', $1, $2);
1228 }
1229 }
1230
1231 elsif ($mode eq 'DEPRECATED') {
1232 if (/\s*\#ifdef/) {
1233 die "Error: Nested #ifdef";
1234 }
1235 elsif (/\s*\#endif/) {
1236 $mode = '';
1237 }
1238 }
1239
1240 elsif (!$mode) {
1241 if (/^\s*\#define\s+(\w+)\s+(.+)/) {
1242 # #define $left $right
1243 my ($left, $right) = ($1, $2);
1244
1245 if ($left eq 'U_UNICODE_VERSION') {
1246 my $version = $right;
1247 $version = $1 if ($version =~ /^\"(.*)\"/);
1248 # print "Unicode version: ", $version, "\n";
1249 die "Error: Multiple versions in $filename"
1250 if (defined $hash->{'_version'});
1251 $hash->{'_version'} = $version;
1252 }
1253
1254 elsif ($left =~ /U_GC_(\w+?)_MASK/) {
1255 addDatum($hash, 'gcm', $left, $1);
1256 }
1257 }
1258
1259 elsif (/^\s*typedef\s+enum\s+(\w+)\s*\{/ ||
1260 /^\s*typedef\s+enum\s+(\w+)\s*$/) {
1261 $mode = $1;
1262 #print "Parsing $mode\n";
1263 }
1264
1265 elsif (/^\s*enum\s+(\w+)\s*\{/ ||
1266 /^\s*enum\s+(\w+)\s*$/) {
1267 $mode = $1;
1268 #print "Parsing $mode\n";
1269 }
1270
1271 elsif (/^\s*\#ifdef\s+ICU_UCHAR_USE_DEPRECATES\b/) {
1272 $mode = 'DEPRECATED';
1273 }
1274 }
1275 }
1276
1277 $in->close();
1278
1279 # hardcode known values for the normalization quick check properties
1280 # see unorm.h for the UNormalizationCheckResult enum
1281
1282 addDatum($hash, 'NFC_QC', 'UNORM_NO', 'N');
1283 addDatum($hash, 'NFC_QC', 'UNORM_YES', 'Y');
1284 addDatum($hash, 'NFC_QC', 'UNORM_MAYBE', 'M');
1285
1286 addDatum($hash, 'NFKC_QC', 'UNORM_NO', 'N');
1287 addDatum($hash, 'NFKC_QC', 'UNORM_YES', 'Y');
1288 addDatum($hash, 'NFKC_QC', 'UNORM_MAYBE', 'M');
1289
1290 # no "maybe" values for NF[K]D
1291
1292 addDatum($hash, 'NFD_QC', 'UNORM_NO', 'N');
1293 addDatum($hash, 'NFD_QC', 'UNORM_YES', 'Y');
1294
1295 addDatum($hash, 'NFKD_QC', 'UNORM_NO', 'N');
1296 addDatum($hash, 'NFKD_QC', 'UNORM_YES', 'Y');
1297
1298 $hash;
1299}
1300
1301#----------------------------------------------------------------------
1302# Add a new value to a two-level hash. That is, given a ref to
1303# a hash, two keys, and a value, add $hash->{$key1}->{$key2} = $value.
1304sub addDatum {
1305 my ($h, $k1, $k2, $v) = @_;
1306 if (exists $h->{$k1}->{$k2}) {
1307 die "Error: $k1:$k2 already set to " .
1308 $h->{$k1}->{$k2} . ", cannot set to " . $v;
1309 }
1310 $h->{$k1}->{$k2} = $v;
1311}
1312
1313#eof