]> git.saurik.com Git - apple/icu.git/blame - icuSources/tools/genpname/preparse.pl
ICU-3.13.tar.gz
[apple/icu.git] / icuSources / tools / genpname / preparse.pl
CommitLineData
b75a7d8f
A
1#!/bin/perl -w
2#*******************************************************************
3# COPYRIGHT:
4# Copyright (c) 2002-2003, International Business Machines Corporation and
5# others. All Rights Reserved.
6#*******************************************************************
7
8# This script reads in UCD files PropertyAliases.txt and
9# PropertyValueAliases.txt and correlates them with ICU enums
10# defined in uchar.h and uscript.h. It then outputs a header
11# file which contains all names and enums. The header is included
12# by the genpname tool C++ source file, which produces the actual
13# binary data file.
14#
15# See usage note below.
16#
17# TODO: The Property[Value]Alias.txt files state that they can support
18# more than 2 names per property|value. Currently (Unicode 3.2) there
19# are always 1 or 2 names. If more names were supported, presumably
20# the format would be something like:
21# nv ; Numeric_Value
22# nv ; Value_Numerique
23# CURRENTLY, this script assumes that there are 1 or two names. Any
24# duplicates it sees are flagged as an error. If multiple aliases
25# appear in a future version of Unicode, modify this script to support
26# that.
27#
28# NOTE: As of ICU 2.6, this script has been modified to know about the
29# pseudo-property gcm/General_Category_Mask, which corresponds to the
30# uchar.h property UCHAR_GENERAL_CATEGORY_MASK. This property
31# corresponds to General_Category but is a bitmask value. It does not
32# exist in the UCD. Therefore, I special case it in several places
33# (search for General_Category_Mask and gcm).
34#
35# NOTE: As of ICU 2.6, this script reads an auxiliary data file,
36# SyntheticPropertyAliases.txt, containing property aliases not
37# present in the UCD but present in ICU. This file resides in the
38# same directory as this script. Its contents are merged into those
39# of PropertyAliases.txt as if the two files were appended.
40#
41# NOTE: The following names are handled specially. See script below
42# for details.
43#
44# T/True
45# F/False
46# No_Block
47#
48# Author: Alan Liu
49# Created: October 14 2002
50# Since: ICU 2.4
51
52use FileHandle;
53use strict;
54use Dumpvalue;
55
56my $DEBUG = 0;
57my $DUMPER = new Dumpvalue;
58
59my $count = @ARGV;
60my $ICU_DIR = shift() || '';
61my $OUT_FILE = shift() || 'data.h';
62my $HEADER_DIR = "$ICU_DIR/source/common/unicode";
63my $UNIDATA_DIR = "$ICU_DIR/source/data/unidata";
64
65# Get the current year from the system
66my $YEAR = 1900+@{[localtime]}[5]; # Get the current year
67
68#----------------------------------------------------------------------
69# Top level property keys for binary, enumerated, string, and double props
70my @TOP = qw( _bp _ep _sp _dp _mp );
71
72# This hash governs how top level properties are grouped into output arrays.
73#my %TOP_PROPS = ( "VALUED" => [ '_bp', '_ep' ],
74# "NO_VALUE" => [ '_sp', '_dp' ] );m
75#my %TOP_PROPS = ( "BINARY" => [ '_bp' ],
76# "ENUMERATED" => [ '_ep' ],
77# "STRING" => [ '_sp' ],
78# "DOUBLE" => [ '_dp' ] );
79my %TOP_PROPS = ( "" => [ '_bp', '_ep', '_sp', '_dp', '_mp' ] );
80
81my %PROP_TYPE = (Binary => "_bp",
82 String => "_sp",
83 Double => "_dp",
84 Enumerated => "_ep",
85 Bitmask => "_mp");
86#----------------------------------------------------------------------
87
88# Properties that are unsupported in ICU
89my %UNSUPPORTED = (Composition_Exclusion => 1,
90 Decomposition_Mapping => 1,
91 Expands_On_NFC => 1,
92 Expands_On_NFD => 1,
93 Expands_On_NFKC => 1,
94 Expands_On_NFKD => 1,
95 FC_NFKC_Closure => 1,
96 ID_Start_Exceptions => 1,
97 NFC_Quick_Check => 1,
98 NFD_Quick_Check => 1,
99 NFKC_Quick_Check => 1,
100 NFKD_Quick_Check => 1,
101 Special_Case_Condition => 1,
102 );
103
104# Short names of properties that weren't seen in uchar.h. If the
105# properties weren't seen, don't complain about the property values
106# missing.
107my %MISSING_FROM_UCHAR;
108
109#----------------------------------------------------------------------
110
111# Emitted class names
112my ($STRING_CLASS, $ALIAS_CLASS, $PROPERTY_CLASS) = qw(AliasName Alias Property);
113
114if ($count < 1 || $count > 2 ||
115 !-d $HEADER_DIR ||
116 !-d $UNIDATA_DIR) {
117 my $me = $0;
118 $me =~ s|.+[/\\]||;
119 my $lm = ' ' x length($me);
120 print <<"END";
121
122$me: Reads ICU4C headers and Unicode data files and creates
123$lm a C header file that is included by genpname. The header
124$lm file matches constants defined in the ICU4C headers with
125$lm property|value aliases in the Unicode data files.
126
127Usage: $me <icu_dir> [<out_file>]
128
129<icu_dir> ICU4C root directory, containing
130 source/common/unicode/uchar.h
131 source/common/unicode/uscript.h
132 source/data/unidata/Blocks.txt
133 source/data/unidata/PropertyAliases.txt
134 source/data/unidata/PropertyValueAliases.txt
135<out_file> File name of header to be written;
136 default is 'data.h'.
137
138The Unicode versions of all input files must match.
139END
140 exit(1);
141}
142
143my ($h, $version) = readAndMerge($HEADER_DIR, $UNIDATA_DIR);
144
145if ($DEBUG) {
146 print "Merged hash:\n";
147 for my $key (sort keys %$h) {
148 my $hh = $h->{$key};
149 for my $subkey (sort keys %$hh) {
150 print "$key:$subkey:", $hh->{$subkey}, "\n";
151 }
152 }
153}
154
155my $out = new FileHandle($OUT_FILE, 'w');
156die "Error: Can't write to $OUT_FILE: $!" unless (defined $out);
157my $save = select($out);
158formatData($h, $version);
159select($save);
160$out->close();
161
162exit(0);
163
164#----------------------------------------------------------------------
165# From PropList.html: "The properties of the form Other_XXX
166# are used to generate properties in DerivedCoreProperties.txt.
167# They are not intended for general use, such as in APIs that
168# return property values.
169# Non_Break is not a valid property as of 3.2.
170sub isIgnoredProperty {
171 local $_ = shift;
172 /^Other_/i || /^Non_Break$/i;
173}
174
175# 'qc' is a pseudo-property matching any quick-check property
176# see PropertyValueAliases.txt file comments. 'binprop' is
177# a synthetic binary value alias "True"/"False", not present
178# in PropertyValueAliases.txt.
179sub isPseudoProperty {
180 $_[0] eq 'qc' ||
181 $_[0] eq 'binprop';
182}
183
184#----------------------------------------------------------------------
185# Emit the combined data from headers and the Unicode database as a
186# C source code header file.
187#
188# @param ref to hash with the data
189# @param Unicode version, as a string
190sub formatData {
191 my $h = shift;
192 my $version = shift;
193
194 my $date = scalar localtime();
195 print <<"END";
196/**
197 * Copyright (C) 2002-$YEAR, International Business Machines Corporation and
198 * others. All Rights Reserved.
199 *
200 * MACHINE GENERATED FILE. !!! Do not edit manually !!!
201 *
202 * Generated from
203 * uchar.h
204 * uscript.h
205 * Blocks.txt
206 * PropertyAliases.txt
207 * PropertyValueAliases.txt
208 *
209 * Date: $date
210 * Unicode version: $version
211 * Script: $0
212 */
213
214END
215
216 #------------------------------------------------------------
217 # Emit Unicode version
218 print "/* Unicode version $version */\n";
219 my @v = split(/\./, $version);
220 push @v, '0' while (@v < 4);
221 for (my $i=0; $i<@v; ++$i) {
222 print "const uint8_t VERSION_$i = $v[$i];\n";
223 }
224 print "\n";
225
226 #------------------------------------------------------------
227 # Emit String table
228 # [A table of all identifiers, that is, all long or short property
229 # or value names. The list need NOT be sorted; it will be sorted
230 # by the C program. Strings are referenced by their index into
231 # this table. After sorting, a REMAP[] array is used to map the
232 # old position indices to the new positions.]
233 my %strings;
234 for my $prop (sort keys %$h) {
235 my $hh = $h->{$prop};
236 for my $enum (sort keys %$hh) {
237 my @a = split(/\|/, $hh->{$enum});
238 for (@a) {
239 $strings{$_} = 1 if (length($_));
240 }
241 }
242 }
243 my @strings = sort keys %strings;
244 unshift @strings, "";
245
246 print "const int32_t STRING_COUNT = ", scalar @strings, ";\n\n";
247
248 # while printing, create a mapping hash from string table entry to index
249 my %stringToID;
250 print "/* to be sorted */\n";
251 print "const $STRING_CLASS STRING_TABLE[] = {\n";
252 for (my $i=0; $i<@strings; ++$i) {
253 print " $STRING_CLASS(\"$strings[$i]\", $i),\n";
254 $stringToID{$strings[$i]} = $i;
255 }
256 print "};\n\n";
257
258 # placeholder for the remapping index. this is used to map
259 # indices that we compute here to indices of the sorted
260 # STRING_TABLE. STRING_TABLE will be sorted by the C++ program
261 # using the uprv_comparePropertyNames() function. this will
262 # reshuffle the order. we then use the indices (passed to the
263 # String constructor) to create a REMAP[] array.
264 print "/* to be filled in */\n";
265 print "int32_t REMAP[", scalar @strings, "];\n\n";
266
267 #------------------------------------------------------------
268 # Emit the name group table
269 # [A table of name groups. A name group is one or more names
270 # for a property or property value. The Unicode data files specify
271 # that there may be more than 2, although as of Unicode 3.2 there
272 # are at most 2. The name group table looks like this:
273 #
274 # 114, -115, 116, -117, 0, -118, 65, -64, ...
275 # [0] [2] [4] [6]
276 #
277 # The entry at [0] consists of 2 strings, 114 and 115.
278 # The entry at [2] consists of 116 and 117. The entry at
279 # [4] is one string, 118. There is always at least one
280 # string; typically there are two. If there are two, the first
281 # is the SHORT name and the second is the LONG. If there is
282 # one, then the missing entry (always the short name, in 3.2)
283 # is zero, which is by definition the index of "". The
284 # 'preferred' name will generally be the LONG name, if there are
285 # more than 2 entries. The last entry is negative.
286
287 # Build name group list and replace string refs with nameGroup indices
288 my @nameGroups;
289
290 # Check for duplicate name groups, and reuse them if possible
291 my %groupToInt; # Map group strings to ints
292 for my $prop (sort keys %$h) {
293 my $hh = $h->{$prop};
294 for my $enum (sort keys %$hh) {
295 my $groupString = $hh->{$enum};
296 my $i;
297 if (exists $groupToInt{$groupString}) {
298 $i = $groupToInt{$groupString};
299 } else {
300 my @names = split(/\|/, $groupString);
301 die "Error: Wrong number of names in " . $groupString if (@names != 2);
302 $i = @nameGroups; # index of group we are making
303 $groupToInt{$groupString} = $i; # Cache for reuse
304 push @nameGroups, map { $stringToID{$_} } @names;
305 $nameGroups[$#nameGroups] = -$nameGroups[$#nameGroups]; # mark end
306 }
307 # now, replace string list with ref to name group
308 $hh->{$enum} = $i;
309 }
310 }
311
312 print "const int32_t NAME_GROUP_COUNT = ",
313 scalar @nameGroups, ";\n\n";
314
315 print "int32_t NAME_GROUP[] = {\n";
316 # emit one group per line, with annotations
317 for (my $i=0; $i<@nameGroups; ) {
318 my @a;
319 my $line;
320 my $start = $i;
321 for (;;) {
322 my $j = $nameGroups[$i++];
323 $line .= "$j, ";
324 push @a, abs($j);
325 last if ($j < 0);
326 }
327 print " ",
328 $line,
329 ' 'x(20-length($line)),
330 "/* ", sprintf("%3d", $start),
331 ": \"", join("\", \"", map { $strings[$_] } @a), "\" */\n";
332 }
333 print "};\n\n";
334
335 # This is fixed for 3.2 at "2" but should be calculated dynamically
336 # when more than 2 names appear in Property[Value]Aliases.txt.
337 print "#define MAX_NAMES_PER_GROUP 2\n\n";
338
339 #------------------------------------------------------------
340 # Emit enumerated property values
341 for my $prop (sort keys %$h) {
342 next if ($prop =~ /^_/);
343 my $vh = $h->{$prop};
344 my $count = scalar keys %$vh;
345
346 print "const int32_t VALUES_${prop}_COUNT = ",
347 $count, ";\n\n";
348
349 print "const $ALIAS_CLASS VALUES_${prop}\[] = {\n";
350 for my $enum (sort keys %$vh) {
351 #my @names = split(/\|/, $vh->{$enum});
352 #die "Error: Wrong number of names for $prop:$enum in [" . join(",", @names) . "]"
353 # if (@names != 2);
354 print " $ALIAS_CLASS((int32_t) $enum, ", $vh->{$enum}, "),\n";
355 #$stringToID{$names[0]}, ", ",
356 #$stringToID{$names[1]}, "),\n";
357 # "\"", $names[0], "\", ",
358 # "\"", $names[1], "\"),\n";
359 }
360 print "};\n\n";
361 }
362
363 #------------------------------------------------------------
364 # Emit top-level properties (binary, enumerated, etc.)
365 for my $topName (sort keys %TOP_PROPS) {
366 my $a = $TOP_PROPS{$topName};
367 my $count = 0;
368 for my $type (@$a) { # "_bp", "_ep", etc.
369 $count += scalar keys %{$h->{$type}};
370 }
371
372 print "const int32_t ${topName}PROPERTY_COUNT = $count;\n\n";
373
374 print "const $PROPERTY_CLASS ${topName}PROPERTY[] = {\n";
375
376 for my $type (@$a) { # "_bp", "_ep", etc.
377 my $p = $h->{$type};
378
379 for my $enum (sort keys %$p) {
380 my $name = $strings[$nameGroups[$p->{$enum}]];
381
382 my $valueRef = "0, NULL";
383 if ($type eq '_bp') {
384 $valueRef = "VALUES_binprop_COUNT, VALUES_binprop";
385 }
386 elsif (exists $h->{$name}) {
387 $valueRef = "VALUES_${name}_COUNT, VALUES_$name";
388 }
389
390 print " $PROPERTY_CLASS((int32_t) $enum, ",
391 $p->{$enum}, ", $valueRef),\n";
392 }
393 }
394 print "};\n\n";
395 }
396
397 print "/*eof*/\n";
398}
399
400#----------------------------------------------------------------------
401# Read in the files uchar.h, uscript.h, Blocks.txt,
402# PropertyAliases.txt, and PropertyValueAliases.txt,
403# and combine them into one hash.
404#
405# @param directory containing headers
406# @param directory containin Unicode data files
407#
408# @return hash ref, Unicode version
409sub readAndMerge {
410
411 my ($headerDir, $unidataDir) = @_;
412
413 my $h = read_uchar("$headerDir/uchar.h");
414 my $s = read_uscript("$headerDir/uscript.h");
415 my $b = read_Blocks("$unidataDir/Blocks.txt");
416 my $pa = {};
417 read_PropertyAliases($pa, "$unidataDir/PropertyAliases.txt");
418 read_PropertyAliases($pa, "SyntheticPropertyAliases.txt");
419 my $va = read_PropertyValueAliases("$unidataDir/PropertyValueAliases.txt");
420
421 # Extract property family hash
422 my $fam = $pa->{'_family'};
423 delete $pa->{'_family'};
424
425 # Note: uscript.h has no version string, so don't check it
426 my $version = check_versions([ 'uchar.h', $h ],
427 [ 'Blocks.txt', $b ],
428 [ 'PropertyAliases.txt', $pa ],
429 [ 'PropertyValueAliases.txt', $va ]);
430
431 # Do this BEFORE merging; merging modifies the hashes
432 check_PropertyValueAliases($pa, $va);
433
434 # Dump out the $va hash for debugging
435 if ($DEBUG) {
436 print "Property values hash:\n";
437 for my $key (sort keys %$va) {
438 my $hh = $va->{$key};
439 for my $subkey (sort keys %$hh) {
440 print "$key:$subkey:", $hh->{$subkey}, "\n";
441 }
442 }
443 }
444
445 # Dump out the $s hash for debugging
446 if ($DEBUG) {
447 print "Script hash:\n";
448 for my $key (sort keys %$s) {
449 print "$key:", $s->{$key}, "\n";
450 }
451 }
452
453 # Link in the script data
454 $h->{'sc'} = $s;
455
456 merge_Blocks($h, $b);
457
458 merge_PropertyAliases($h, $pa, $fam);
459
460 merge_PropertyValueAliases($h, $va);
461
462 ($h, $version);
463}
464
465#----------------------------------------------------------------------
466# Ensure that the version strings in the given hashes (under the key
467# '_version') are compatible. Currently this means they must be
468# identical, with the exception that "X.Y" will match "X.Y.0".
469# All hashes must define the key '_version'.
470#
471# @param a list of pairs of (file name, hash reference)
472#
473# @return the version of all the hashes. Upon return, the '_version'
474# will be removed from all hashes.
475sub check_versions {
476 my $version = '';
477 my $msg = '';
478 foreach my $a (@_) {
479 my $name = $a->[0];
480 my $h = $a->[1];
481 die "Error: No version found" unless (exists $h->{'_version'});
482 my $v = $h->{'_version'};
483 delete $h->{'_version'};
484
485 # append ".0" if necessary, to standardize to X.Y.Z
486 $v .= '.0' unless ($v =~ /\.\d+\./);
487 $v .= '.0' unless ($v =~ /\.\d+\./);
488 $msg .= "$name = $v\n";
489 if ($version) {
490 die "Error: Mismatched Unicode versions\n$msg"
491 unless ($version eq $v);
492 } else {
493 $version = $v;
494 }
495 }
496 $version;
497}
498
499#----------------------------------------------------------------------
500# Make sure the property names in PropertyValueAliases.txt match those
501# in PropertyAliases.txt.
502#
503# @param a hash ref from read_PropertyAliases.
504# @param a hash ref from read_PropertyValueAliases.
505sub check_PropertyValueAliases {
506 my ($pa, $va) = @_;
507
508 # make a reverse hash of short->long
509 my %rev;
510 for (keys %$pa) { $rev{$pa->{$_}} = $_; }
511
512 for my $prop (keys %$va) {
513 if (!exists $rev{$prop} && !isPseudoProperty($prop)) {
514 print "Warning: Property $prop from PropertyValueAliases not listed in PropertyAliases\n";
515 }
516 }
517}
518
519#----------------------------------------------------------------------
520# Merge blocks data into uchar.h enum data. In the 'blk' subhash all
521# code point values, as returned from read_uchar, are replaced by
522# block names, as read from Blocks.txt and returned by read_Blocks.
523# The match must be 1-to-1. If there is any failure of 1-to-1
524# mapping, an error is signaled. Upon return, the read_Blocks hash
525# is emptied of all contents, except for those that failed to match.
526#
527# The mapping in the 'blk' subhash, after this function returns, is
528# from uchar.h enum name, e.g. "UBLOCK_BASIC_LATIN", to Blocks.h
529# pseudo-name, e.g. "Basic Latin".
530#
531# @param a hash ref from read_uchar.
532# @param a hash ref from read_Blocks.
533sub merge_Blocks {
534 my ($h, $b) = @_;
535
536 die "Error: No blocks data in uchar.h"
537 unless (exists $h->{'blk'});
538 my $blk = $h->{'blk'};
539 for my $enum (keys %$blk) {
540 my $cp = $blk->{$enum};
541 if ($cp && !exists $b->{$cp}) {
542 die "Error: No block found at $cp in Blocks.txt";
543 }
544 # Convert code point to pseudo-name:
545 $blk->{$enum} = $b->{$cp};
546 delete $b->{$cp};
547 }
548 my $err = '';
549 for my $cp (keys %$b) {
550 $err .= "Error: Block " . $b->{$cp} . " not listed in uchar.h\n";
551 }
552 die $err if ($err);
553}
554
555#----------------------------------------------------------------------
556# Merge property alias names into the uchar.h hash. The subhashes
557# under the keys _* (b(inary, e(numerated, s(tring, d(ouble) are
558# examined and the values of those subhashes are assumed to be long
559# names in PropertyAliases.txt. They are validated and replaced by
560# "<short>|<long>". Upon return, the read_PropertyAliases hash is
561# emptied of all contents, except for those that failed to match.
562# Unmatched names in PropertyAliases are listed as a warning but do
563# NOT cause the script to die.
564#
565# @param a hash ref from read_uchar.
566# @param a hash ref from read_PropertyAliases.
567# @param a hash mapping long names to property family (e.g., 'binary')
568sub merge_PropertyAliases {
569 my ($h, $pa, $fam) = @_;
570
571 for my $k (@TOP) {
572 die "Error: No properties data for $k in uchar.h"
573 unless (exists $h->{$k});
574 }
575
576 for my $subh (map { $h->{$_} } @TOP) {
577 for my $enum (keys %$subh) {
578 my $name = $subh->{$enum};
579 die "Error: Property $name not found (or used more than once)"
580 unless (exists $pa->{$name});
581
582 $subh->{$enum} = $pa->{$name} . "|" . $name;
583 delete $pa->{$name};
584 }
585 }
586 my @err;
587 for my $name (keys %$pa) {
588 $MISSING_FROM_UCHAR{$pa->{$name}} = 1;
589 if (exists $UNSUPPORTED{$name}) {
590 push @err, "Info: No enum for " . $fam->{$name} . " property $name in uchar.h";
591 } elsif (!isIgnoredProperty($name)) {
592 push @err, "Warning: No enum for " . $fam->{$name} . " property $name in uchar.h";
593 }
594 }
595 print join("\n", sort @err), "\n" if (@err);
596}
597
598#----------------------------------------------------------------------
599# Return 1 if two names match ignoring whitespace, '-', and '_'.
600# Used to match names in Blocks.txt with those in PropertyValueAliases.txt
601# as of Unicode 4.0.
602sub matchesLoosely {
603 my ($a, $b) = @_;
604 $a =~ s/[\s\-_]//g;
605 $b =~ s/[\s\-_]//g;
606 $a =~ /^$b$/i;
607}
608
609#----------------------------------------------------------------------
610# Merge PropertyValueAliases.txt data into the uchar.h hash. All
611# properties other than blk, _bp, and _ep are analyzed and mapped to
612# the names listed in PropertyValueAliases. They are then replaced
613# with a string of the form "<short>|<long>". The short or long name
614# may be missing.
615#
616# @param a hash ref from read_uchar.
617# @param a hash ref from read_PropertyValueAliases.
618sub merge_PropertyValueAliases {
619 my ($h, $va) = @_;
620
621 my %gcCount;
622 for my $prop (keys %$h) {
623 # _bp, _ep handled in merge_PropertyAliases
624 next if ($prop =~ /^_/);
625
626 # Special case: gcm
627 my $prop2 = ($prop eq 'gcm') ? 'gc' : $prop;
628
629 # find corresponding PropertyValueAliases data
630 die "Error: Can't find $prop in PropertyValueAliases.txt"
631 unless (exists $va->{$prop2});
632 my $pva = $va->{$prop2};
633
634 # match up data
635 my $hh = $h->{$prop};
636 for my $enum (keys %$hh) {
637
638 my $name = $hh->{$enum};
639
640 # look up both long and short & ignore case
641 my $n;
642 if (exists $pva->{$name}) {
643 $n = $name;
644 } else {
645 # iterate (slow)
646 for my $a (keys %$pva) {
647 # case-insensitive match
648 # & case-insensitive reverse match
649 if ($a =~ /^$name$/i ||
650 $pva->{$a} =~ /^$name$/i) {
651 $n = $a;
652 last;
653 }
654 }
655 }
656
657 # For blocks, do a loose match from Blocks.txt pseudo-name
658 # to PropertyValueAliases long name.
659 if (!$n && $prop eq 'blk') {
660 for my $a (keys %$pva) {
661 # The block is only going to match the long name,
662 # but we check both for completeness. As of Unicode
663 # 4.0, blocks do not have short names.
664 if (matchesLoosely($name, $pva->{$a}) ||
665 matchesLoosely($name, $a)) {
666 $n = $a;
667 last;
668 }
669 }
670 }
671
672 die "Error: Property value $prop:$name not found" unless ($n);
673
674 my $l = $n;
675 my $r = $pva->{$n};
676 # convert |n/a\d+| to blank
677 $l = '' if ($l =~ m|^n/a\d+$|);
678 $r = '' if ($r =~ m|^n/a\d+$|);
679
680 $hh->{$enum} = "$l|$r";
681 # Don't delete the 'gc' properties because we need to share
682 # them between 'gc' and 'gcm'. Count each use instead.
683 if ($prop2 eq 'gc') {
684 ++$gcCount{$n};
685 } else {
686 delete $pva->{$n};
687 }
688 }
689 }
690
691 # Merge the combining class values in manually
692 die "Error: No ccc data"
693 unless exists $va->{'ccc'};
694 for my $ccc (keys %{$va->{'ccc'}}) {
695 die "Error: Can't overwrite ccc $ccc"
696 if (exists $h->{'ccc'}->{$ccc});
697 $h->{'ccc'}->{$ccc} = $va->{'ccc'}->{$ccc};
698 }
699 delete $va->{'ccc'};
700
701 # Merge synthetic binary property values in manually.
702 # These are the "True" and "False" value aliases.
703 die "Error: No True/False value aliases"
704 unless exists $va->{'binprop'};
705 for my $bp (keys %{$va->{'binprop'}}) {
706 $h->{'binprop'}->{$bp} = $va->{'binprop'}->{$bp};
707 }
708 delete $va->{'binprop'};
709
710 my $err = '';
711 for my $prop (sort keys %$va) {
712 my $hh = $va->{$prop};
713 for my $subkey (sort keys %$hh) {
714 # 'gc' props are shared with 'gcm'; make sure they were used
715 # once or twice.
716 if ($prop eq 'gc') {
717 my $n = $gcCount{$subkey};
718 next if ($n >= 1 && $n <= 2);
719 }
720 $err .= "Warning: Enum for value $prop:$subkey not found in uchar.h\n"
721 unless exists $MISSING_FROM_UCHAR{$prop};
722 }
723 }
724 print $err if ($err);
725}
726
727#----------------------------------------------------------------------
728# Read the PropertyAliases.txt file. Return a hash that maps the long
729# name to the short name. The special key '_version' will map to the
730# Unicode version of the file. The special key '_family' holds a
731# subhash that maps long names to a family string, for descriptive
732# purposes.
733#
734# @param a filename for PropertyAliases.txt
735# @param reference to hash to receive data. Keys are long names.
736# Values are short names.
737sub read_PropertyAliases {
738
739 my $hash = shift; # result
740
741 my $filename = shift;
742
743 my $fam = {}; # map long names to family string
744 $fam = $hash->{'_family'} if (exists $hash->{'_family'});
745
746 my $family; # binary, enumerated, etc.
747
748 my $in = new FileHandle($filename, 'r');
749 die "Error: Cannot open $filename" if (!defined $in);
750
751 while (<$in>) {
752
753 # Read version (embedded in a comment)
754 if (/PropertyAliases-(\d+\.\d+\.\d+)/i) {
755 die "Error: Multiple versions in $filename"
756 if (exists $hash->{'_version'});
757 $hash->{'_version'} = $1;
758 }
759
760 # Read family heading
761 if (/^\s*\#\s*(.+?)\s*Properties\s*$/) {
762 $family = $1;
763 }
764
765 # Ignore comments and blank lines
766 s/\#.*//;
767 next unless (/\S/);
768
769 if (/^\s*(.+?)\s*;\s*(.+?)\s*$/i) {
770 die "Error: Duplicate property $1 in $filename"
771 if (exists $hash->{$2});
772 $hash->{$2} = $1;
773 $fam->{$2} = $family;
774 }
775
776 else {
777 die "Error: Can't parse $_ in $filename";
778 }
779 }
780
781 $in->close();
782
783 $hash->{'_family'} = $fam;
784}
785
786#----------------------------------------------------------------------
787# Read the PropertyValueAliases.txt file. Return a two level hash
788# that maps property_short_name:value_short_name:value_long_name. In
789# the case of the 'ccc' property, the short name is the numeric class
790# and the long name is "<short>|<long>". The special key '_version'
791# will map to the Unicode version of the file.
792#
793# @param a filename for PropertyValueAliases.txt
794#
795# @return a hash reference.
796sub read_PropertyValueAliases {
797
798 my $filename = shift;
799
800 my $hash = {}; # result
801
802 my $in = new FileHandle($filename, 'r');
803 die "Error: Cannot open $filename" if (!defined $in);
804
805 my $sym = 0; # Used to make "n/a" strings unique
806
807 while (<$in>) {
808
809 # Read version (embedded in a comment)
810 if (/PropertyValueAliases-(\d+\.\d+\.\d+)/i) {
811 die "Error: Multiple versions in $filename"
812 if (exists $hash->{'_version'});
813 $hash->{'_version'} = $1;
814 }
815
816 # Ignore comments and blank lines
817 s/\#.*//;
818 next unless (/\S/);
819
820 if (/^\s*(.+?)\s*;/i) {
821 my $prop = $1;
822 my @fields = /;\s*([^\s;]+)/g;
823 die "Error: Wrong number of fields"
824 if (@fields < 2 || @fields > 3);
825 # Make "n/a" strings unique
826 $fields[0] .= sprintf("%03d", $sym++) if ($fields[0] eq 'n/a');
827 # Squash extra fields together
828 while (@fields > 2) {
829 my $f = pop @fields;
830 $fields[$#fields] .= '|' . $f;
831 }
832 addDatum($hash, $prop, @fields);
833 }
834
835 else {
836 die "Error: Can't parse $_ in $filename";
837 }
838 }
839
840 $in->close();
841
842 # Script Qaac (Coptic) is a special case. Handle it here. See UTR#24:
843 # http://www.unicode.org/unicode/reports/tr24/
844 $hash->{'sc'}->{'Qaac'} = 'Coptic'
845 unless (exists $hash->{'sc'}->{'Qaac'});
846
847 # Add T|True and F|False -- these are values we recognize for
848 # binary properties (NOT from PropertyValueAliases.txt). These
849 # are of the same form as the 'ccc' value aliases.
850 $hash->{'binprop'}->{'0'} = 'F|False';
851 $hash->{'binprop'}->{'1'} = 'T|True';
852
853 $hash;
854}
855
856#----------------------------------------------------------------------
857# Read the Blocks.txt file. Return a hash that maps the code point
858# range start to the block name. The special key '_version' will map
859# to the Unicode version of the file.
860#
861# As of Unicode 4.0, the names in the Blocks.txt are no longer the
862# proper names. The proper names are now listed in PropertyValueAliases.
863# They are similar but not identical. Furthermore, 4.0 introduces
864# a new block name, No_Block, which is listed only in PropertyValueAliases
865# and not in Blocks.txt. As a result, we handle blocks as follows:
866#
867# 1. Read Blocks.txt to map code point range start to quasi-block name.
868# 2. Add to Blocks.txt a synthetic No Block code point & name:
869# X -> No Block
870# 3. Map quasi-names from Blocks.txt (including No Block) to actual
871# names from PropertyValueAliases. This occurs in
872# merge_PropertyValueAliases.
873#
874# @param a filename for Blocks.txt
875#
876# @return a ref to a hash. Keys are code points, as text, e.g.,
877# "1720". Values are pseudo-block names, e.g., "Hanunoo".
878sub read_Blocks {
879
880 my $filename = shift;
881
882 my $hash = {}; # result
883
884 my $in = new FileHandle($filename, 'r');
885 die "Error: Cannot open $filename" if (!defined $in);
886
887 while (<$in>) {
888
889 # Read version (embedded in a comment)
890 if (/Blocks-(\d+\.\d+\.\d+)/i) {
891 die "Error: Multiple versions in $filename"
892 if (exists $hash->{'_version'});
893 $hash->{'_version'} = $1;
894 }
895
896 # Ignore comments and blank lines
897 s/\#.*//;
898 next unless (/\S/);
899
900 if (/^([0-9a-f]+)\.\.[0-9a-f]+;\s*(.+?)\s*$/i) {
901 die "Error: Duplicate range $1 in $filename"
902 if (exists $hash->{$1});
903 $hash->{$1} = $2;
904 }
905
906 else {
907 die "Error: Can't parse $_ in $filename";
908 }
909 }
910
911 $in->close();
912
913 # Add pseudo-name for No Block
914 $hash->{'none'} = 'No Block';
915
916 $hash;
917}
918
919#----------------------------------------------------------------------
920# Read the uscript.h file and compile a mapping of Unicode symbols to
921# icu4c enum values.
922#
923# @param a filename for uscript.h
924#
925# @return a ref to a hash. The keys of the hash are enum symbols from
926# uscript.h, and the values are script names.
927sub read_uscript {
928
929 my $filename = shift;
930
931 my $mode = ''; # state machine mode and submode
932 my $submode = '';
933
934 my $last = ''; # for line folding
935
936 my $hash = {}; # result
937 my $key; # first-level key
938
939 my $in = new FileHandle($filename, 'r');
940 die "Error: Cannot open $filename" if (!defined $in);
941
942 while (<$in>) {
943 # Fold continued lines together
944 if (/^(.*)\\$/) {
945 $last = $1;
946 next;
947 } elsif ($last) {
948 $_ = $last . $_;
949 $last = '';
950 }
951
952 # Exit all modes here
953 if ($mode && $mode ne 'DEPRECATED') {
954 if (/^\s*\}/) {
955 $mode = '';
956 next;
957 }
958 }
959
960 # Handle individual modes
961
962 if ($mode eq 'UScriptCode') {
963 if (m|^\s*(USCRIPT_\w+).+?/\*\s*(\w+)|) {
964 my ($enum, $code) = ($1, $2);
965 die "Error: Duplicate script $enum"
966 if (exists $hash->{$enum});
967 $hash->{$enum} = $code;
968 }
969 }
970
971 elsif ($mode eq 'DEPRECATED') {
972 if (/\s*\#ifdef/) {
973 die "Error: Nested #ifdef";
974 }
975 elsif (/\s*\#endif/) {
976 $mode = '';
977 }
978 }
979
980 elsif (!$mode) {
981 if (/^\s*typedef\s+enum\s+(\w+)\s*\{/ ||
982 /^\s*typedef\s+enum\s+(\w+)\s*$/) {
983 $mode = $1;
984 #print "Parsing $mode\n";
985 }
986
987 elsif (/^\s*\#ifdef\s+ICU_UCHAR_USE_DEPRECATES\b/) {
988 $mode = 'DEPRECATED';
989 }
990 }
991 }
992
993 $in->close();
994
995 $hash;
996}
997
998#----------------------------------------------------------------------
999# Read the uchar.h file and compile a mapping of Unicode symbols to
1000# icu4c enum values.
1001#
1002# @param a filename for uchar.h
1003#
1004# @return a ref to a hash. The keys of the hash are '_bp' for binary
1005# properties, '_ep' for enumerated properties, '_dp'/'_sp'/'_mp' for
1006# double/string/mask properties, and 'gc', 'gcm', 'bc', 'blk',
1007# 'ea', 'dt', 'jt', 'jg', 'lb', or 'nt' for corresponding property
1008# value aliases. The values of the hash are subhashes. The subhashes
1009# have a key of the uchar.h enum symbol, and a value of the alias
1010# string (as listed in PropertyValueAliases.txt). NOTE: The alias
1011# string is whatever alias uchar.h lists. This may be either short or
1012# long, depending on the specific enum. NOTE: For blocks ('blk'), the
1013# value is a hex code point for the start of the associated block.
1014# NOTE: The special key _version will map to the Unicode version of
1015# the file.
1016sub read_uchar {
1017
1018 my $filename = shift;
1019
1020 my $mode = ''; # state machine mode and submode
1021 my $submode = '';
1022
1023 my $last = ''; # for line folding
1024
1025 my $hash = {}; # result
1026 my $key; # first-level key
1027
1028 my $in = new FileHandle($filename, 'r');
1029 die "Error: Cannot open $filename" if (!defined $in);
1030
1031 while (<$in>) {
1032 # Fold continued lines together
1033 if (/^(.*)\\$/) {
1034 $last .= $1;
1035 next;
1036 } elsif ($last) {
1037 $_ = $last . $_;
1038 $last = '';
1039 }
1040
1041 # Exit all modes here
1042 if ($mode && $mode ne 'DEPRECATED') {
1043 if (/^\s*\}/) {
1044 $mode = '';
1045 next;
1046 }
1047 }
1048
1049 # Handle individual modes
1050
1051 if ($mode eq 'UProperty') {
1052 if (/^\s*(UCHAR_\w+)\s*[,=]/ || /^\s+(UCHAR_\w+)\s*$/) {
1053 if ($submode) {
1054 addDatum($hash, $key, $1, $submode);
1055 $submode = '';
1056 } else {
1057 #print "Warning: Ignoring $1\n";
1058 }
1059 }
1060
1061 elsif (m|^\s*/\*\*\s*(\w+)\s+property\s+(\w+)|i) {
1062 die "Error: Unmatched tag $submode" if ($submode);
1063 die "Error: Unrecognized UProperty comment: $_"
1064 unless (exists $PROP_TYPE{$1});
1065 $key = $PROP_TYPE{$1};
1066 $submode = $2;
1067 }
1068 }
1069
1070 elsif ($mode eq 'UCharCategory') {
1071 if (/^\s*(U_\w+)\s*=/) {
1072 if ($submode) {
1073 addDatum($hash, 'gc', $1, $submode);
1074 $submode = '';
1075 } else {
1076 #print "Warning: Ignoring $1\n";
1077 }
1078 }
1079
1080 elsif (m|^\s*/\*\*\s*([A-Z][a-z])\s|) {
1081 die "Error: Unmatched tag $submode" if ($submode);
1082 $submode = $1;
1083 }
1084 }
1085
1086 elsif ($mode eq 'UCharDirection') {
1087 if (/^\s*(U_\w+)\s*[,=]/ || /^\s+(U_\w+)\s*$/) {
1088 if ($submode) {
1089 addDatum($hash, $key, $1, $submode);
1090 $submode = '';
1091 } else {
1092 #print "Warning: Ignoring $1\n";
1093 }
1094 }
1095
1096 elsif (m|/\*\*\s*([A-Z]+)\s|) {
1097 die "Error: Unmatched tag $submode" if ($submode);
1098 $key = 'bc';
1099 $submode = $1;
1100 }
1101 }
1102
1103 elsif ($mode eq 'UBlockCode') {
1104 if (m|^\s*(UBLOCK_\w+).+?/\*\[(.+?)\]\*/|) {
1105 addDatum($hash, 'blk', $1, $2);
1106 }
1107 }
1108
1109 elsif ($mode eq 'UEastAsianWidth') {
1110 if (m|^\s*(U_EA_\w+).+?/\*\[(.+?)\]\*/|) {
1111 addDatum($hash, 'ea', $1, $2);
1112 }
1113 }
1114
1115 elsif ($mode eq 'UDecompositionType') {
1116 if (m|^\s*(U_DT_\w+).+?/\*\[(.+?)\]\*/|) {
1117 addDatum($hash, 'dt', $1, $2);
1118 }
1119 }
1120
1121 elsif ($mode eq 'UJoiningType') {
1122 if (m|^\s*(U_JT_\w+).+?/\*\[(.+?)\]\*/|) {
1123 addDatum($hash, 'jt', $1, $2);
1124 }
1125 }
1126
1127 elsif ($mode eq 'UJoiningGroup') {
1128 if (/^\s*(U_JG_(\w+))/) {
1129 addDatum($hash, 'jg', $1, $2) unless ($2 eq 'COUNT');
1130 }
1131 }
1132
1133 elsif ($mode eq 'ULineBreak') {
1134 if (m|^\s*(U_LB_\w+).+?/\*\[(.+?)\]\*/|) {
1135 addDatum($hash, 'lb', $1, $2);
1136 }
1137 }
1138
1139 elsif ($mode eq 'UNumericType') {
1140 if (m|^\s*(U_NT_\w+).+?/\*\[(.+?)\]\*/|) {
1141 addDatum($hash, 'nt', $1, $2);
1142 }
1143 }
1144
1145 elsif ($mode eq 'UHangulSyllableType') {
1146 if (m|^\s*(U_HST_\w+).+?/\*\[(.+?)\]\*/|) {
1147 addDatum($hash, 'hst', $1, $2);
1148 }
1149 }
1150
1151 elsif ($mode eq 'DEPRECATED') {
1152 if (/\s*\#ifdef/) {
1153 die "Error: Nested #ifdef";
1154 }
1155 elsif (/\s*\#endif/) {
1156 $mode = '';
1157 }
1158 }
1159
1160 elsif (!$mode) {
1161 if (/^\s*\#define\s+(\w+)\s+(.+)/) {
1162 # #define $left $right
1163 my ($left, $right) = ($1, $2);
1164
1165 if ($left eq 'U_UNICODE_VERSION') {
1166 my $version = $right;
1167 $version = $1 if ($version =~ /^\"(.*)\"/);
1168 # print "Unicode version: ", $version, "\n";
1169 die "Error: Multiple versions in $filename"
1170 if (defined $hash->{'_version'});
1171 $hash->{'_version'} = $version;
1172 }
1173
1174 elsif ($left =~ /U_GC_(\w+?)_MASK/) {
1175 addDatum($hash, 'gcm', $left, $1);
1176 }
1177 }
1178
1179 elsif (/^\s*typedef\s+enum\s+(\w+)\s*\{/ ||
1180 /^\s*typedef\s+enum\s+(\w+)\s*$/) {
1181 $mode = $1;
1182 #print "Parsing $mode\n";
1183 }
1184
1185 elsif (/^\s*enum\s+(\w+)\s*\{/ ||
1186 /^\s*enum\s+(\w+)\s*$/) {
1187 $mode = $1;
1188 #print "Parsing $mode\n";
1189 }
1190
1191 elsif (/^\s*\#ifdef\s+ICU_UCHAR_USE_DEPRECATES\b/) {
1192 $mode = 'DEPRECATED';
1193 }
1194 }
1195 }
1196
1197 $in->close();
1198
1199 $hash;
1200}
1201
1202#----------------------------------------------------------------------
1203# Add a new value to a two-level hash. That is, given a ref to
1204# a hash, two keys, and a value, add $hash->{$key1}->{$key2} = $value.
1205sub addDatum {
1206 my ($h, $k1, $k2, $v) = @_;
1207 if (exists $h->{$k1}->{$k2}) {
1208 die "Error: $k1:$k2 already set to " .
1209 $h->{$k1}->{$k2} . ", cannot set to " . $v;
1210 }
1211 $h->{$k1}->{$k2} = $v;
1212}
1213
1214#eof