]>
Commit | Line | Data |
---|---|---|
1 | #!/bin/perl -w | |
2 | #******************************************************************* | |
3 | # COPYRIGHT: | |
4 | # Copyright (c) 2002-2006, International Business Machines Corporation and | |
5 | # others. All Rights Reserved. | |
6 | #******************************************************************* | |
7 | ||
8 | # This script reads in UCD files PropertyAliases.txt and | |
9 | # PropertyValueAliases.txt and correlates them with ICU enums | |
10 | # defined in uchar.h and uscript.h. It then outputs a header | |
11 | # file which contains all names and enums. The header is included | |
12 | # by the genpname tool C++ source file, which produces the actual | |
13 | # binary data file. | |
14 | # | |
15 | # See usage note below. | |
16 | # | |
17 | # TODO: The Property[Value]Alias.txt files state that they can support | |
18 | # more than 2 names per property|value. Currently (Unicode 3.2) there | |
19 | # are always 1 or 2 names. If more names were supported, presumably | |
20 | # the format would be something like: | |
21 | # nv ; Numeric_Value | |
22 | # nv ; Value_Numerique | |
23 | # CURRENTLY, this script assumes that there are 1 or two names. Any | |
24 | # duplicates it sees are flagged as an error. If multiple aliases | |
25 | # appear in a future version of Unicode, modify this script to support | |
26 | # that. | |
27 | # | |
28 | # NOTE: As of ICU 2.6, this script has been modified to know about the | |
29 | # pseudo-property gcm/General_Category_Mask, which corresponds to the | |
30 | # uchar.h property UCHAR_GENERAL_CATEGORY_MASK. This property | |
31 | # corresponds to General_Category but is a bitmask value. It does not | |
32 | # exist in the UCD. Therefore, I special case it in several places | |
33 | # (search for General_Category_Mask and gcm). | |
34 | # | |
35 | # NOTE: As of ICU 2.6, this script reads an auxiliary data file, | |
36 | # SyntheticPropertyAliases.txt, containing property aliases not | |
37 | # present in the UCD but present in ICU. This file resides in the | |
38 | # same directory as this script. Its contents are merged into those | |
39 | # of PropertyAliases.txt as if the two files were appended. | |
40 | # | |
41 | # NOTE: The following names are handled specially. See script below | |
42 | # for details. | |
43 | # | |
44 | # T/True | |
45 | # F/False | |
46 | # No_Block | |
47 | # | |
48 | # Author: Alan Liu | |
49 | # Created: October 14 2002 | |
50 | # Since: ICU 2.4 | |
51 | ||
52 | use FileHandle; | |
53 | use strict; | |
54 | use Dumpvalue; | |
55 | ||
56 | my $DEBUG = 1; | |
57 | my $DUMPER = new Dumpvalue; | |
58 | ||
59 | my $count = @ARGV; | |
60 | my $ICU_DIR = shift() || ''; | |
61 | my $OUT_FILE = shift() || 'data.h'; | |
62 | my $HEADER_DIR = "$ICU_DIR/source/common/unicode"; | |
63 | my $UNIDATA_DIR = "$ICU_DIR/source/data/unidata"; | |
64 | ||
65 | # Get the current year from the system | |
66 | my $YEAR = 1900+@{[localtime]}[5]; # Get the current year | |
67 | ||
68 | # Used to make "n/a" property [value] aliases (Unicode or Synthetic) unique | |
69 | my $propNA = 0; | |
70 | my $valueNA = 0; | |
71 | ||
72 | #---------------------------------------------------------------------- | |
73 | # Top level property keys for binary, enumerated, string, and double props | |
74 | my @TOP = qw( _bp _ep _sp _dp _mp ); | |
75 | ||
76 | # This hash governs how top level properties are grouped into output arrays. | |
77 | #my %TOP_PROPS = ( "VALUED" => [ '_bp', '_ep' ], | |
78 | # "NO_VALUE" => [ '_sp', '_dp' ] );m | |
79 | #my %TOP_PROPS = ( "BINARY" => [ '_bp' ], | |
80 | # "ENUMERATED" => [ '_ep' ], | |
81 | # "STRING" => [ '_sp' ], | |
82 | # "DOUBLE" => [ '_dp' ] ); | |
83 | my %TOP_PROPS = ( "" => [ '_bp', '_ep', '_sp', '_dp', '_mp' ] ); | |
84 | ||
85 | my %PROP_TYPE = (Binary => "_bp", | |
86 | String => "_sp", | |
87 | Double => "_dp", | |
88 | Enumerated => "_ep", | |
89 | Bitmask => "_mp"); | |
90 | #---------------------------------------------------------------------- | |
91 | ||
92 | # Properties that are unsupported in ICU | |
93 | my %UNSUPPORTED = (Composition_Exclusion => 1, | |
94 | Decomposition_Mapping => 1, | |
95 | Expands_On_NFC => 1, | |
96 | Expands_On_NFD => 1, | |
97 | Expands_On_NFKC => 1, | |
98 | Expands_On_NFKD => 1, | |
99 | FC_NFKC_Closure => 1, | |
100 | ID_Start_Exceptions => 1, | |
101 | Special_Case_Condition => 1, | |
102 | ); | |
103 | ||
104 | # Short names of properties that weren't seen in uchar.h. If the | |
105 | # properties weren't seen, don't complain about the property values | |
106 | # missing. | |
107 | my %MISSING_FROM_UCHAR; | |
108 | ||
109 | # Additional property aliases beyond short and long names, | |
110 | # like space in addition to WSpace and White_Space in Unicode 4.1. | |
111 | # Hashtable, maps long name to alias. | |
112 | # For example, maps White_Space->space. | |
113 | # | |
114 | # If multiple additional aliases are defined, | |
115 | # then they are separated in the value string with '|'. | |
116 | # For example, White_Space->space|outer_space | |
117 | my %additional_property_aliases; | |
118 | ||
119 | #---------------------------------------------------------------------- | |
120 | ||
121 | # Emitted class names | |
122 | my ($STRING_CLASS, $ALIAS_CLASS, $PROPERTY_CLASS) = qw(AliasName Alias Property); | |
123 | ||
124 | if ($count < 1 || $count > 2 || | |
125 | !-d $HEADER_DIR || | |
126 | !-d $UNIDATA_DIR) { | |
127 | my $me = $0; | |
128 | $me =~ s|.+[/\\]||; | |
129 | my $lm = ' ' x length($me); | |
130 | print <<"END"; | |
131 | ||
132 | $me: Reads ICU4C headers and Unicode data files and creates | |
133 | $lm a C header file that is included by genpname. The header | |
134 | $lm file matches constants defined in the ICU4C headers with | |
135 | $lm property|value aliases in the Unicode data files. | |
136 | ||
137 | Usage: $me <icu_dir> [<out_file>] | |
138 | ||
139 | <icu_dir> ICU4C root directory, containing | |
140 | source/common/unicode/uchar.h | |
141 | source/common/unicode/uscript.h | |
142 | source/data/unidata/Blocks.txt | |
143 | source/data/unidata/PropertyAliases.txt | |
144 | source/data/unidata/PropertyValueAliases.txt | |
145 | <out_file> File name of header to be written; | |
146 | default is 'data.h'. | |
147 | ||
148 | The Unicode versions of all input files must match. | |
149 | END | |
150 | exit(1); | |
151 | } | |
152 | ||
153 | my ($h, $version) = readAndMerge($HEADER_DIR, $UNIDATA_DIR); | |
154 | ||
155 | if ($DEBUG) { | |
156 | print "Merged hash:\n"; | |
157 | for my $key (sort keys %$h) { | |
158 | my $hh = $h->{$key}; | |
159 | for my $subkey (sort keys %$hh) { | |
160 | print "$key:$subkey:", $hh->{$subkey}, "\n"; | |
161 | } | |
162 | } | |
163 | } | |
164 | ||
165 | my $out = new FileHandle($OUT_FILE, 'w'); | |
166 | die "Error: Can't write to $OUT_FILE: $!" unless (defined $out); | |
167 | my $save = select($out); | |
168 | formatData($h, $version); | |
169 | select($save); | |
170 | $out->close(); | |
171 | ||
172 | exit(0); | |
173 | ||
174 | #---------------------------------------------------------------------- | |
175 | # From PropList.html: "The properties of the form Other_XXX | |
176 | # are used to generate properties in DerivedCoreProperties.txt. | |
177 | # They are not intended for general use, such as in APIs that | |
178 | # return property values. | |
179 | # Non_Break is not a valid property as of 3.2. | |
180 | sub isIgnoredProperty { | |
181 | local $_ = shift; | |
182 | /^Other_/i || /^Non_Break$/i; | |
183 | } | |
184 | ||
185 | # 'qc' is a pseudo-property matching any quick-check property | |
186 | # see PropertyValueAliases.txt file comments. 'binprop' is | |
187 | # a synthetic binary value alias "True"/"False", not present | |
188 | # in PropertyValueAliases.txt. | |
189 | sub isPseudoProperty { | |
190 | $_[0] eq 'qc' || | |
191 | $_[0] eq 'binprop'; | |
192 | } | |
193 | ||
194 | #---------------------------------------------------------------------- | |
195 | # Emit the combined data from headers and the Unicode database as a | |
196 | # C source code header file. | |
197 | # | |
198 | # @param ref to hash with the data | |
199 | # @param Unicode version, as a string | |
200 | sub formatData { | |
201 | my $h = shift; | |
202 | my $version = shift; | |
203 | ||
204 | my $date = scalar localtime(); | |
205 | print <<"END"; | |
206 | /** | |
207 | * Copyright (C) 2002-$YEAR, International Business Machines Corporation and | |
208 | * others. All Rights Reserved. | |
209 | * | |
210 | * MACHINE GENERATED FILE. !!! Do not edit manually !!! | |
211 | * | |
212 | * Generated from | |
213 | * uchar.h | |
214 | * uscript.h | |
215 | * Blocks.txt | |
216 | * PropertyAliases.txt | |
217 | * PropertyValueAliases.txt | |
218 | * | |
219 | * Date: $date | |
220 | * Unicode version: $version | |
221 | * Script: $0 | |
222 | */ | |
223 | ||
224 | END | |
225 | ||
226 | #------------------------------------------------------------ | |
227 | # Emit Unicode version | |
228 | print "/* Unicode version $version */\n"; | |
229 | my @v = split(/\./, $version); | |
230 | push @v, '0' while (@v < 4); | |
231 | for (my $i=0; $i<@v; ++$i) { | |
232 | print "const uint8_t VERSION_$i = $v[$i];\n"; | |
233 | } | |
234 | print "\n"; | |
235 | ||
236 | #------------------------------------------------------------ | |
237 | # Emit String table | |
238 | # [A table of all identifiers, that is, all long or short property | |
239 | # or value names. The list need NOT be sorted; it will be sorted | |
240 | # by the C program. Strings are referenced by their index into | |
241 | # this table. After sorting, a REMAP[] array is used to map the | |
242 | # old position indices to the new positions.] | |
243 | my %strings; | |
244 | for my $prop (sort keys %$h) { | |
245 | my $hh = $h->{$prop}; | |
246 | for my $enum (sort keys %$hh) { | |
247 | my @a = split(/\|/, $hh->{$enum}); | |
248 | for (@a) { | |
249 | $strings{$_} = 1 if (length($_)); | |
250 | } | |
251 | } | |
252 | } | |
253 | my @strings = sort keys %strings; | |
254 | unshift @strings, ""; | |
255 | ||
256 | print "const int32_t STRING_COUNT = ", scalar @strings, ";\n\n"; | |
257 | ||
258 | # while printing, create a mapping hash from string table entry to index | |
259 | my %stringToID; | |
260 | print "/* to be sorted */\n"; | |
261 | print "const $STRING_CLASS STRING_TABLE[] = {\n"; | |
262 | for (my $i=0; $i<@strings; ++$i) { | |
263 | print " $STRING_CLASS(\"$strings[$i]\", $i),\n"; | |
264 | $stringToID{$strings[$i]} = $i; | |
265 | } | |
266 | print "};\n\n"; | |
267 | ||
268 | # placeholder for the remapping index. this is used to map | |
269 | # indices that we compute here to indices of the sorted | |
270 | # STRING_TABLE. STRING_TABLE will be sorted by the C++ program | |
271 | # using the uprv_comparePropertyNames() function. this will | |
272 | # reshuffle the order. we then use the indices (passed to the | |
273 | # String constructor) to create a REMAP[] array. | |
274 | print "/* to be filled in */\n"; | |
275 | print "int32_t REMAP[", scalar @strings, "];\n\n"; | |
276 | ||
277 | #------------------------------------------------------------ | |
278 | # Emit the name group table | |
279 | # [A table of name groups. A name group is one or more names | |
280 | # for a property or property value. The Unicode data files specify | |
281 | # that there may be more than 2, although as of Unicode 3.2 there | |
282 | # are at most 2. The name group table looks like this: | |
283 | # | |
284 | # 114, -115, 116, -117, 0, -118, 65, -64, ... | |
285 | # [0] [2] [4] [6] | |
286 | # | |
287 | # The entry at [0] consists of 2 strings, 114 and 115. | |
288 | # The entry at [2] consists of 116 and 117. The entry at | |
289 | # [4] is one string, 118. There is always at least one | |
290 | # string; typically there are two. If there are two, the first | |
291 | # is the SHORT name and the second is the LONG. If there is | |
292 | # one, then the missing entry (always the short name, in 3.2) | |
293 | # is zero, which is by definition the index of "". The | |
294 | # 'preferred' name will generally be the LONG name, if there are | |
295 | # more than 2 entries. The last entry is negative. | |
296 | ||
297 | # Build name group list and replace string refs with nameGroup indices | |
298 | my @nameGroups; | |
299 | ||
300 | # Check for duplicate name groups, and reuse them if possible | |
301 | my %groupToInt; # Map group strings to ints | |
302 | for my $prop (sort keys %$h) { | |
303 | my $hh = $h->{$prop}; | |
304 | for my $enum (sort keys %$hh) { | |
305 | my $groupString = $hh->{$enum}; | |
306 | my $i; | |
307 | if (exists $groupToInt{$groupString}) { | |
308 | $i = $groupToInt{$groupString}; | |
309 | } else { | |
310 | my @names = split(/\|/, $groupString); | |
311 | die "Error: Wrong number of names in " . $groupString if (@names < 1); | |
312 | $i = @nameGroups; # index of group we are making | |
313 | $groupToInt{$groupString} = $i; # Cache for reuse | |
314 | push @nameGroups, map { $stringToID{$_} } @names; | |
315 | $nameGroups[$#nameGroups] = -$nameGroups[$#nameGroups]; # mark end | |
316 | } | |
317 | # now, replace string list with ref to name group | |
318 | $hh->{$enum} = $i; | |
319 | } | |
320 | } | |
321 | ||
322 | print "const int32_t NAME_GROUP_COUNT = ", | |
323 | scalar @nameGroups, ";\n\n"; | |
324 | ||
325 | print "int32_t NAME_GROUP[] = {\n"; | |
326 | # emit one group per line, with annotations | |
327 | my $max_names = 0; | |
328 | for (my $i=0; $i<@nameGroups; ) { | |
329 | my @a; | |
330 | my $line; | |
331 | my $start = $i; | |
332 | for (;;) { | |
333 | my $j = $nameGroups[$i++]; | |
334 | $line .= "$j, "; | |
335 | push @a, abs($j); | |
336 | last if ($j < 0); | |
337 | } | |
338 | print " ", | |
339 | $line, | |
340 | ' 'x(20-length($line)), | |
341 | "/* ", sprintf("%3d", $start), | |
342 | ": \"", join("\", \"", map { $strings[$_] } @a), "\" */\n"; | |
343 | $max_names = @a if(@a > $max_names); | |
344 | ||
345 | } | |
346 | print "};\n\n"; | |
347 | ||
348 | # This is fixed for 3.2 at "2" but should be calculated dynamically | |
349 | # when more than 2 names appear in Property[Value]Aliases.txt. | |
350 | print "#define MAX_NAMES_PER_GROUP $max_names\n\n"; | |
351 | ||
352 | #------------------------------------------------------------ | |
353 | # Emit enumerated property values | |
354 | for my $prop (sort keys %$h) { | |
355 | next if ($prop =~ /^_/); | |
356 | my $vh = $h->{$prop}; | |
357 | my $count = scalar keys %$vh; | |
358 | ||
359 | print "const int32_t VALUES_${prop}_COUNT = ", | |
360 | $count, ";\n\n"; | |
361 | ||
362 | print "const $ALIAS_CLASS VALUES_${prop}\[] = {\n"; | |
363 | for my $enum (sort keys %$vh) { | |
364 | #my @names = split(/\|/, $vh->{$enum}); | |
365 | #die "Error: Wrong number of names for $prop:$enum in [" . join(",", @names) . "]" | |
366 | # if (@names != 2); | |
367 | print " $ALIAS_CLASS((int32_t) $enum, ", $vh->{$enum}, "),\n"; | |
368 | #$stringToID{$names[0]}, ", ", | |
369 | #$stringToID{$names[1]}, "),\n"; | |
370 | # "\"", $names[0], "\", ", | |
371 | # "\"", $names[1], "\"),\n"; | |
372 | } | |
373 | print "};\n\n"; | |
374 | } | |
375 | ||
376 | #------------------------------------------------------------ | |
377 | # Emit top-level properties (binary, enumerated, etc.) | |
378 | for my $topName (sort keys %TOP_PROPS) { | |
379 | my $a = $TOP_PROPS{$topName}; | |
380 | my $count = 0; | |
381 | for my $type (@$a) { # "_bp", "_ep", etc. | |
382 | $count += scalar keys %{$h->{$type}}; | |
383 | } | |
384 | ||
385 | print "const int32_t ${topName}PROPERTY_COUNT = $count;\n\n"; | |
386 | ||
387 | print "const $PROPERTY_CLASS ${topName}PROPERTY[] = {\n"; | |
388 | ||
389 | for my $type (@$a) { # "_bp", "_ep", etc. | |
390 | my $p = $h->{$type}; | |
391 | ||
392 | for my $enum (sort keys %$p) { | |
393 | my $name = $strings[$nameGroups[$p->{$enum}]]; | |
394 | ||
395 | my $valueRef = "0, NULL"; | |
396 | if ($type eq '_bp') { | |
397 | $valueRef = "VALUES_binprop_COUNT, VALUES_binprop"; | |
398 | } | |
399 | elsif (exists $h->{$name}) { | |
400 | $valueRef = "VALUES_${name}_COUNT, VALUES_$name"; | |
401 | } | |
402 | ||
403 | print " $PROPERTY_CLASS((int32_t) $enum, ", | |
404 | $p->{$enum}, ", $valueRef),\n"; | |
405 | } | |
406 | } | |
407 | print "};\n\n"; | |
408 | } | |
409 | ||
410 | print "/*eof*/\n"; | |
411 | } | |
412 | ||
413 | #---------------------------------------------------------------------- | |
414 | # Read in the files uchar.h, uscript.h, Blocks.txt, | |
415 | # PropertyAliases.txt, and PropertyValueAliases.txt, | |
416 | # and combine them into one hash. | |
417 | # | |
418 | # @param directory containing headers | |
419 | # @param directory containin Unicode data files | |
420 | # | |
421 | # @return hash ref, Unicode version | |
422 | sub readAndMerge { | |
423 | ||
424 | my ($headerDir, $unidataDir) = @_; | |
425 | ||
426 | my $h = read_uchar("$headerDir/uchar.h"); | |
427 | my $s = read_uscript("$headerDir/uscript.h"); | |
428 | my $b = read_Blocks("$unidataDir/Blocks.txt"); | |
429 | my $pa = {}; | |
430 | read_PropertyAliases($pa, "$unidataDir/PropertyAliases.txt"); | |
431 | read_PropertyAliases($pa, "SyntheticPropertyAliases.txt"); | |
432 | my $va = {}; | |
433 | read_PropertyValueAliases($va, "$unidataDir/PropertyValueAliases.txt"); | |
434 | read_PropertyValueAliases($va, "SyntheticPropertyValueAliases.txt"); | |
435 | ||
436 | # Extract property family hash | |
437 | my $fam = $pa->{'_family'}; | |
438 | delete $pa->{'_family'}; | |
439 | ||
440 | # Note: uscript.h has no version string, so don't check it | |
441 | my $version = check_versions([ 'uchar.h', $h ], | |
442 | [ 'Blocks.txt', $b ], | |
443 | [ 'PropertyAliases.txt', $pa ], | |
444 | [ 'PropertyValueAliases.txt', $va ]); | |
445 | ||
446 | # Do this BEFORE merging; merging modifies the hashes | |
447 | check_PropertyValueAliases($pa, $va); | |
448 | ||
449 | # Dump out the $va hash for debugging | |
450 | if ($DEBUG) { | |
451 | print "Property values hash:\n"; | |
452 | for my $key (sort keys %$va) { | |
453 | my $hh = $va->{$key}; | |
454 | for my $subkey (sort keys %$hh) { | |
455 | print "$key:$subkey:", $hh->{$subkey}, "\n"; | |
456 | } | |
457 | } | |
458 | } | |
459 | ||
460 | # Dump out the $s hash for debugging | |
461 | if ($DEBUG) { | |
462 | print "Script hash:\n"; | |
463 | for my $key (sort keys %$s) { | |
464 | print "$key:", $s->{$key}, "\n"; | |
465 | } | |
466 | } | |
467 | ||
468 | # Link in the script data | |
469 | $h->{'sc'} = $s; | |
470 | ||
471 | merge_Blocks($h, $b); | |
472 | ||
473 | merge_PropertyAliases($h, $pa, $fam); | |
474 | ||
475 | merge_PropertyValueAliases($h, $va); | |
476 | ||
477 | ($h, $version); | |
478 | } | |
479 | ||
480 | #---------------------------------------------------------------------- | |
481 | # Ensure that the version strings in the given hashes (under the key | |
482 | # '_version') are compatible. Currently this means they must be | |
483 | # identical, with the exception that "X.Y" will match "X.Y.0". | |
484 | # All hashes must define the key '_version'. | |
485 | # | |
486 | # @param a list of pairs of (file name, hash reference) | |
487 | # | |
488 | # @return the version of all the hashes. Upon return, the '_version' | |
489 | # will be removed from all hashes. | |
490 | sub check_versions { | |
491 | my $version = ''; | |
492 | my $msg = ''; | |
493 | foreach my $a (@_) { | |
494 | my $name = $a->[0]; | |
495 | my $h = $a->[1]; | |
496 | die "Error: No version found" unless (exists $h->{'_version'}); | |
497 | my $v = $h->{'_version'}; | |
498 | delete $h->{'_version'}; | |
499 | ||
500 | # append ".0" if necessary, to standardize to X.Y.Z | |
501 | $v .= '.0' unless ($v =~ /\.\d+\./); | |
502 | $v .= '.0' unless ($v =~ /\.\d+\./); | |
503 | $msg .= "$name = $v\n"; | |
504 | if ($version) { | |
505 | die "Error: Mismatched Unicode versions\n$msg" | |
506 | unless ($version eq $v); | |
507 | } else { | |
508 | $version = $v; | |
509 | } | |
510 | } | |
511 | $version; | |
512 | } | |
513 | ||
514 | #---------------------------------------------------------------------- | |
515 | # Make sure the property names in PropertyValueAliases.txt match those | |
516 | # in PropertyAliases.txt. | |
517 | # | |
518 | # @param a hash ref from read_PropertyAliases. | |
519 | # @param a hash ref from read_PropertyValueAliases. | |
520 | sub check_PropertyValueAliases { | |
521 | my ($pa, $va) = @_; | |
522 | ||
523 | # make a reverse hash of short->long | |
524 | my %rev; | |
525 | for (keys %$pa) { $rev{$pa->{$_}} = $_; } | |
526 | ||
527 | for my $prop (keys %$va) { | |
528 | if (!exists $rev{$prop} && !isPseudoProperty($prop)) { | |
529 | print "Warning: Property $prop from PropertyValueAliases not listed in PropertyAliases\n"; | |
530 | } | |
531 | } | |
532 | } | |
533 | ||
534 | #---------------------------------------------------------------------- | |
535 | # Merge blocks data into uchar.h enum data. In the 'blk' subhash all | |
536 | # code point values, as returned from read_uchar, are replaced by | |
537 | # block names, as read from Blocks.txt and returned by read_Blocks. | |
538 | # The match must be 1-to-1. If there is any failure of 1-to-1 | |
539 | # mapping, an error is signaled. Upon return, the read_Blocks hash | |
540 | # is emptied of all contents, except for those that failed to match. | |
541 | # | |
542 | # The mapping in the 'blk' subhash, after this function returns, is | |
543 | # from uchar.h enum name, e.g. "UBLOCK_BASIC_LATIN", to Blocks.h | |
544 | # pseudo-name, e.g. "Basic Latin". | |
545 | # | |
546 | # @param a hash ref from read_uchar. | |
547 | # @param a hash ref from read_Blocks. | |
548 | sub merge_Blocks { | |
549 | my ($h, $b) = @_; | |
550 | ||
551 | die "Error: No blocks data in uchar.h" | |
552 | unless (exists $h->{'blk'}); | |
553 | my $blk = $h->{'blk'}; | |
554 | for my $enum (keys %$blk) { | |
555 | my $cp = $blk->{$enum}; | |
556 | if ($cp && !exists $b->{$cp}) { | |
557 | die "Error: No block found at $cp in Blocks.txt"; | |
558 | } | |
559 | # Convert code point to pseudo-name: | |
560 | $blk->{$enum} = $b->{$cp}; | |
561 | delete $b->{$cp}; | |
562 | } | |
563 | my $err = ''; | |
564 | for my $cp (keys %$b) { | |
565 | $err .= "Error: Block " . $b->{$cp} . " not listed in uchar.h\n"; | |
566 | } | |
567 | die $err if ($err); | |
568 | } | |
569 | ||
570 | #---------------------------------------------------------------------- | |
571 | # Merge property alias names into the uchar.h hash. The subhashes | |
572 | # under the keys _* (b(inary, e(numerated, s(tring, d(ouble) are | |
573 | # examined and the values of those subhashes are assumed to be long | |
574 | # names in PropertyAliases.txt. They are validated and replaced by | |
575 | # "<short>|<long>". Upon return, the read_PropertyAliases hash is | |
576 | # emptied of all contents, except for those that failed to match. | |
577 | # Unmatched names in PropertyAliases are listed as a warning but do | |
578 | # NOT cause the script to die. | |
579 | # | |
580 | # @param a hash ref from read_uchar. | |
581 | # @param a hash ref from read_PropertyAliases. | |
582 | # @param a hash mapping long names to property family (e.g., 'binary') | |
583 | sub merge_PropertyAliases { | |
584 | my ($h, $pa, $fam) = @_; | |
585 | ||
586 | for my $k (@TOP) { | |
587 | die "Error: No properties data for $k in uchar.h" | |
588 | unless (exists $h->{$k}); | |
589 | } | |
590 | ||
591 | for my $subh (map { $h->{$_} } @TOP) { | |
592 | for my $enum (keys %$subh) { | |
593 | my $long_name = $subh->{$enum}; | |
594 | if (!exists $pa->{$long_name}) { | |
595 | die "Error: Property $long_name not found (or used more than once)"; | |
596 | } | |
597 | ||
598 | my $value; | |
599 | if($pa->{$long_name} =~ m|^n/a\d*$|) { | |
600 | # replace an "n/a" short name with an empty name (nothing before "|"); | |
601 | # don't remove it (don't remove the "|"): there must always be a long name, | |
602 | # and if the short name is removed, then the long name becomes the | |
603 | # short name and there is no long name left (unless there is another alias) | |
604 | $value = "|" . $long_name; | |
605 | } else { | |
606 | $value = $pa->{$long_name} . "|" . $long_name; | |
607 | } | |
608 | if (exists $additional_property_aliases{$long_name}) { | |
609 | $value .= "|" . $additional_property_aliases{$long_name}; | |
610 | } | |
611 | $subh->{$enum} = $value; | |
612 | delete $pa->{$long_name}; | |
613 | } | |
614 | } | |
615 | ||
616 | my @err; | |
617 | for my $name (keys %$pa) { | |
618 | $MISSING_FROM_UCHAR{$pa->{$name}} = 1; | |
619 | if (exists $UNSUPPORTED{$name}) { | |
620 | push @err, "Info: No enum for " . $fam->{$name} . " property $name in uchar.h"; | |
621 | } elsif (!isIgnoredProperty($name)) { | |
622 | push @err, "Warning: No enum for " . $fam->{$name} . " property $name in uchar.h"; | |
623 | } | |
624 | } | |
625 | print join("\n", sort @err), "\n" if (@err); | |
626 | } | |
627 | ||
628 | #---------------------------------------------------------------------- | |
629 | # Return 1 if two names match ignoring whitespace, '-', and '_'. | |
630 | # Used to match names in Blocks.txt with those in PropertyValueAliases.txt | |
631 | # as of Unicode 4.0. | |
632 | sub matchesLoosely { | |
633 | my ($a, $b) = @_; | |
634 | $a =~ s/[\s\-_]//g; | |
635 | $b =~ s/[\s\-_]//g; | |
636 | $a =~ /^$b$/i; | |
637 | } | |
638 | ||
639 | #---------------------------------------------------------------------- | |
640 | # Merge PropertyValueAliases.txt data into the uchar.h hash. All | |
641 | # properties other than blk, _bp, and _ep are analyzed and mapped to | |
642 | # the names listed in PropertyValueAliases. They are then replaced | |
643 | # with a string of the form "<short>|<long>". The short or long name | |
644 | # may be missing. | |
645 | # | |
646 | # @param a hash ref from read_uchar. | |
647 | # @param a hash ref from read_PropertyValueAliases. | |
648 | sub merge_PropertyValueAliases { | |
649 | my ($h, $va) = @_; | |
650 | ||
651 | my %gcCount; | |
652 | for my $prop (keys %$h) { | |
653 | # _bp, _ep handled in merge_PropertyAliases | |
654 | next if ($prop =~ /^_/); | |
655 | ||
656 | # Special case: gcm | |
657 | my $prop2 = ($prop eq 'gcm') ? 'gc' : $prop; | |
658 | ||
659 | # find corresponding PropertyValueAliases data | |
660 | die "Error: Can't find $prop in PropertyValueAliases.txt" | |
661 | unless (exists $va->{$prop2}); | |
662 | my $pva = $va->{$prop2}; | |
663 | ||
664 | # match up data | |
665 | my $hh = $h->{$prop}; | |
666 | for my $enum (keys %$hh) { | |
667 | ||
668 | my $name = $hh->{$enum}; | |
669 | ||
670 | # look up both long and short & ignore case | |
671 | my $n; | |
672 | if (exists $pva->{$name}) { | |
673 | $n = $name; | |
674 | } else { | |
675 | # iterate (slow) | |
676 | for my $a (keys %$pva) { | |
677 | # case-insensitive match | |
678 | # & case-insensitive reverse match | |
679 | if ($a =~ /^$name$/i || | |
680 | $pva->{$a} =~ /^$name$/i) { | |
681 | $n = $a; | |
682 | last; | |
683 | } | |
684 | } | |
685 | } | |
686 | ||
687 | # For blocks, do a loose match from Blocks.txt pseudo-name | |
688 | # to PropertyValueAliases long name. | |
689 | if (!$n && $prop eq 'blk') { | |
690 | for my $a (keys %$pva) { | |
691 | # The block is only going to match the long name, | |
692 | # but we check both for completeness. As of Unicode | |
693 | # 4.0, blocks do not have short names. | |
694 | if (matchesLoosely($name, $pva->{$a}) || | |
695 | matchesLoosely($name, $a)) { | |
696 | $n = $a; | |
697 | last; | |
698 | } | |
699 | } | |
700 | } | |
701 | ||
702 | die "Error: Property value $prop:$name not found" unless ($n); | |
703 | ||
704 | my $l = $n; | |
705 | my $r = $pva->{$n}; | |
706 | # convert |n/a\d*| to blank | |
707 | $l = '' if ($l =~ m|^n/a\d*$|); | |
708 | $r = '' if ($r =~ m|^n/a\d*$|); | |
709 | ||
710 | $hh->{$enum} = "$l|$r"; | |
711 | # Don't delete the 'gc' properties because we need to share | |
712 | # them between 'gc' and 'gcm'. Count each use instead. | |
713 | if ($prop2 eq 'gc') { | |
714 | ++$gcCount{$n}; | |
715 | } else { | |
716 | delete $pva->{$n}; | |
717 | } | |
718 | } | |
719 | } | |
720 | ||
721 | # Merge the combining class values in manually | |
722 | # Add the same values to the synthetic lccc and tccc properties | |
723 | die "Error: No ccc data" | |
724 | unless exists $va->{'ccc'}; | |
725 | for my $ccc (keys %{$va->{'ccc'}}) { | |
726 | die "Error: Can't overwrite ccc $ccc" | |
727 | if (exists $h->{'ccc'}->{$ccc}); | |
728 | $h->{'lccc'}->{$ccc} = | |
729 | $h->{'tccc'}->{$ccc} = | |
730 | $h->{'ccc'}->{$ccc} = $va->{'ccc'}->{$ccc}; | |
731 | } | |
732 | delete $va->{'ccc'}; | |
733 | ||
734 | # Merge synthetic binary property values in manually. | |
735 | # These are the "True" and "False" value aliases. | |
736 | die "Error: No True/False value aliases" | |
737 | unless exists $va->{'binprop'}; | |
738 | for my $bp (keys %{$va->{'binprop'}}) { | |
739 | $h->{'binprop'}->{$bp} = $va->{'binprop'}->{$bp}; | |
740 | } | |
741 | delete $va->{'binprop'}; | |
742 | ||
743 | my $err = ''; | |
744 | for my $prop (sort keys %$va) { | |
745 | my $hh = $va->{$prop}; | |
746 | for my $subkey (sort keys %$hh) { | |
747 | # 'gc' props are shared with 'gcm'; make sure they were used | |
748 | # once or twice. | |
749 | if ($prop eq 'gc') { | |
750 | my $n = $gcCount{$subkey}; | |
751 | next if ($n >= 1 && $n <= 2); | |
752 | } | |
753 | $err .= "Warning: Enum for value $prop:$subkey not found in uchar.h\n" | |
754 | unless exists $MISSING_FROM_UCHAR{$prop}; | |
755 | } | |
756 | } | |
757 | print $err if ($err); | |
758 | } | |
759 | ||
760 | #---------------------------------------------------------------------- | |
761 | # Read the PropertyAliases.txt file. Return a hash that maps the long | |
762 | # name to the short name. The special key '_version' will map to the | |
763 | # Unicode version of the file. The special key '_family' holds a | |
764 | # subhash that maps long names to a family string, for descriptive | |
765 | # purposes. | |
766 | # | |
767 | # @param a filename for PropertyAliases.txt | |
768 | # @param reference to hash to receive data. Keys are long names. | |
769 | # Values are short names. | |
770 | sub read_PropertyAliases { | |
771 | ||
772 | my $hash = shift; # result | |
773 | ||
774 | my $filename = shift; | |
775 | ||
776 | my $fam = {}; # map long names to family string | |
777 | $fam = $hash->{'_family'} if (exists $hash->{'_family'}); | |
778 | ||
779 | my $family; # binary, enumerated, etc. | |
780 | ||
781 | my $in = new FileHandle($filename, 'r'); | |
782 | die "Error: Cannot open $filename" if (!defined $in); | |
783 | ||
784 | while (<$in>) { | |
785 | ||
786 | # Read version (embedded in a comment) | |
787 | if (/PropertyAliases-(\d+\.\d+\.\d+)/i) { | |
788 | die "Error: Multiple versions in $filename" | |
789 | if (exists $hash->{'_version'}); | |
790 | $hash->{'_version'} = $1; | |
791 | } | |
792 | ||
793 | # Read family heading | |
794 | if (/^\s*\#\s*(.+?)\s*Properties\s*$/) { | |
795 | $family = $1; | |
796 | } | |
797 | ||
798 | # Ignore comments and blank lines | |
799 | s/\#.*//; | |
800 | next unless (/\S/); | |
801 | ||
802 | if (/^\s*(.+?)\s*;/) { | |
803 | my $short = $1; | |
804 | my @fields = /;\s*([^\s;]+)/g; | |
805 | if (@fields < 1 || @fields > 2) { | |
806 | my $number = @fields; | |
807 | die "Error: Wrong number of fields ($number) in $filename at $_"; | |
808 | } | |
809 | ||
810 | # Make "n/a" strings unique | |
811 | if ($short eq 'n/a') { | |
812 | $short .= sprintf("%03d", $propNA++); | |
813 | } | |
814 | my $long = $fields[0]; | |
815 | if ($long eq 'n/a') { | |
816 | $long .= sprintf("%03d", $propNA++); | |
817 | } | |
818 | ||
819 | # Add long name->short name to the hash=pa hash table | |
820 | if (exists $hash->{$long}) { | |
821 | die "Error: Duplicate property $long in $filename" | |
822 | } | |
823 | $hash->{$long} = $short; | |
824 | $fam->{$long} = $family; | |
825 | ||
826 | # Add the list of further aliases to the additional_property_aliases hash table, | |
827 | # using the long property name as the key. | |
828 | # For example: | |
829 | # White_Space->space|outer_space | |
830 | if (@fields > 1) { | |
831 | my $value = pop @fields; | |
832 | while (@fields > 1) { | |
833 | $value .= "|" . pop @fields; | |
834 | } | |
835 | $additional_property_aliases{$long} = $value; | |
836 | } | |
837 | } else { | |
838 | die "Error: Can't parse $_ in $filename"; | |
839 | } | |
840 | } | |
841 | ||
842 | $in->close(); | |
843 | ||
844 | $hash->{'_family'} = $fam; | |
845 | } | |
846 | ||
847 | #---------------------------------------------------------------------- | |
848 | # Read the PropertyValueAliases.txt file. Return a two level hash | |
849 | # that maps property_short_name:value_short_name:value_long_name. In | |
850 | # the case of the 'ccc' property, the short name is the numeric class | |
851 | # and the long name is "<short>|<long>". The special key '_version' | |
852 | # will map to the Unicode version of the file. | |
853 | # | |
854 | # @param a filename for PropertyValueAliases.txt | |
855 | # | |
856 | # @return a hash reference. | |
857 | sub read_PropertyValueAliases { | |
858 | ||
859 | my $hash = shift; # result | |
860 | ||
861 | my $filename = shift; | |
862 | ||
863 | my $in = new FileHandle($filename, 'r'); | |
864 | die "Error: Cannot open $filename" if (!defined $in); | |
865 | ||
866 | while (<$in>) { | |
867 | ||
868 | # Read version (embedded in a comment) | |
869 | if (/PropertyValueAliases-(\d+\.\d+\.\d+)/i) { | |
870 | die "Error: Multiple versions in $filename" | |
871 | if (exists $hash->{'_version'}); | |
872 | $hash->{'_version'} = $1; | |
873 | } | |
874 | ||
875 | # Ignore comments and blank lines | |
876 | s/\#.*//; | |
877 | next unless (/\S/); | |
878 | ||
879 | if (/^\s*(.+?)\s*;/i) { | |
880 | my $prop = $1; | |
881 | my @fields = /;\s*([^\s;]+)/g; | |
882 | die "Error: Wrong number of fields in $filename" | |
883 | if (@fields < 2 || @fields > 3); | |
884 | # Make "n/a" strings unique | |
885 | $fields[0] .= sprintf("%03d", $valueNA++) if ($fields[0] eq 'n/a'); | |
886 | # Squash extra fields together | |
887 | while (@fields > 2) { | |
888 | my $f = pop @fields; | |
889 | $fields[$#fields] .= '|' . $f; | |
890 | } | |
891 | addDatum($hash, $prop, @fields); | |
892 | } | |
893 | ||
894 | else { | |
895 | die "Error: Can't parse $_ in $filename"; | |
896 | } | |
897 | } | |
898 | ||
899 | $in->close(); | |
900 | ||
901 | # Script Copt=Qaac (Coptic) is a special case. | |
902 | # Before the Copt code was defined, the private-use code Qaac was used. | |
903 | # Starting with Unicode 4.1, PropertyValueAliases.txt contains | |
904 | # Copt as the short name as well as Qaac as an alias. | |
905 | # For use with older Unicode data files, we add here a Qaac->Coptic entry. | |
906 | # This should not do anything for 4.1-and-later Unicode data files. | |
907 | # See also UAX #24: Script Names http://www.unicode.org/unicode/reports/tr24/ | |
908 | $hash->{'sc'}->{'Qaac'} = 'Coptic' | |
909 | unless (exists $hash->{'sc'}->{'Qaac'} || exists $hash->{'sc'}->{'Copt'}); | |
910 | ||
911 | # Add T|True and F|False -- these are values we recognize for | |
912 | # binary properties (NOT from PropertyValueAliases.txt). These | |
913 | # are of the same form as the 'ccc' value aliases. | |
914 | $hash->{'binprop'}->{'0'} = 'F|False'; | |
915 | $hash->{'binprop'}->{'1'} = 'T|True'; | |
916 | } | |
917 | ||
918 | #---------------------------------------------------------------------- | |
919 | # Read the Blocks.txt file. Return a hash that maps the code point | |
920 | # range start to the block name. The special key '_version' will map | |
921 | # to the Unicode version of the file. | |
922 | # | |
923 | # As of Unicode 4.0, the names in the Blocks.txt are no longer the | |
924 | # proper names. The proper names are now listed in PropertyValueAliases. | |
925 | # They are similar but not identical. Furthermore, 4.0 introduces | |
926 | # a new block name, No_Block, which is listed only in PropertyValueAliases | |
927 | # and not in Blocks.txt. As a result, we handle blocks as follows: | |
928 | # | |
929 | # 1. Read Blocks.txt to map code point range start to quasi-block name. | |
930 | # 2. Add to Blocks.txt a synthetic No Block code point & name: | |
931 | # X -> No Block | |
932 | # 3. Map quasi-names from Blocks.txt (including No Block) to actual | |
933 | # names from PropertyValueAliases. This occurs in | |
934 | # merge_PropertyValueAliases. | |
935 | # | |
936 | # @param a filename for Blocks.txt | |
937 | # | |
938 | # @return a ref to a hash. Keys are code points, as text, e.g., | |
939 | # "1720". Values are pseudo-block names, e.g., "Hanunoo". | |
940 | sub read_Blocks { | |
941 | ||
942 | my $filename = shift; | |
943 | ||
944 | my $hash = {}; # result | |
945 | ||
946 | my $in = new FileHandle($filename, 'r'); | |
947 | die "Error: Cannot open $filename" if (!defined $in); | |
948 | ||
949 | while (<$in>) { | |
950 | ||
951 | # Read version (embedded in a comment) | |
952 | if (/Blocks-(\d+\.\d+\.\d+)/i) { | |
953 | die "Error: Multiple versions in $filename" | |
954 | if (exists $hash->{'_version'}); | |
955 | $hash->{'_version'} = $1; | |
956 | } | |
957 | ||
958 | # Ignore comments and blank lines | |
959 | s/\#.*//; | |
960 | next unless (/\S/); | |
961 | ||
962 | if (/^([0-9a-f]+)\.\.[0-9a-f]+\s*;\s*(.+?)\s*$/i) { | |
963 | die "Error: Duplicate range $1 in $filename" | |
964 | if (exists $hash->{$1}); | |
965 | $hash->{$1} = $2; | |
966 | } | |
967 | ||
968 | else { | |
969 | die "Error: Can't parse $_ in $filename"; | |
970 | } | |
971 | } | |
972 | ||
973 | $in->close(); | |
974 | ||
975 | # Add pseudo-name for No Block | |
976 | $hash->{'none'} = 'No Block'; | |
977 | ||
978 | $hash; | |
979 | } | |
980 | ||
981 | #---------------------------------------------------------------------- | |
982 | # Read the uscript.h file and compile a mapping of Unicode symbols to | |
983 | # icu4c enum values. | |
984 | # | |
985 | # @param a filename for uscript.h | |
986 | # | |
987 | # @return a ref to a hash. The keys of the hash are enum symbols from | |
988 | # uscript.h, and the values are script names. | |
989 | sub read_uscript { | |
990 | ||
991 | my $filename = shift; | |
992 | ||
993 | my $mode = ''; # state machine mode and submode | |
994 | my $submode = ''; | |
995 | ||
996 | my $last = ''; # for line folding | |
997 | ||
998 | my $hash = {}; # result | |
999 | my $key; # first-level key | |
1000 | ||
1001 | my $in = new FileHandle($filename, 'r'); | |
1002 | die "Error: Cannot open $filename" if (!defined $in); | |
1003 | ||
1004 | while (<$in>) { | |
1005 | # Fold continued lines together | |
1006 | if (/^(.*)\\$/) { | |
1007 | $last = $1; | |
1008 | next; | |
1009 | } elsif ($last) { | |
1010 | $_ = $last . $_; | |
1011 | $last = ''; | |
1012 | } | |
1013 | ||
1014 | # Exit all modes here | |
1015 | if ($mode && $mode ne 'DEPRECATED') { | |
1016 | if (/^\s*\}/) { | |
1017 | $mode = ''; | |
1018 | next; | |
1019 | } | |
1020 | } | |
1021 | ||
1022 | # Handle individual modes | |
1023 | ||
1024 | if ($mode eq 'UScriptCode') { | |
1025 | if (m|^\s*(USCRIPT_\w+).+?/\*\s*(\w+)|) { | |
1026 | my ($enum, $code) = ($1, $2); | |
1027 | die "Error: Duplicate script $enum" | |
1028 | if (exists $hash->{$enum}); | |
1029 | $hash->{$enum} = $code; | |
1030 | } | |
1031 | } | |
1032 | ||
1033 | elsif ($mode eq 'DEPRECATED') { | |
1034 | if (/\s*\#ifdef/) { | |
1035 | die "Error: Nested #ifdef"; | |
1036 | } | |
1037 | elsif (/\s*\#endif/) { | |
1038 | $mode = ''; | |
1039 | } | |
1040 | } | |
1041 | ||
1042 | elsif (!$mode) { | |
1043 | if (/^\s*typedef\s+enum\s+(\w+)\s*\{/ || | |
1044 | /^\s*typedef\s+enum\s+(\w+)\s*$/) { | |
1045 | $mode = $1; | |
1046 | #print "Parsing $mode\n"; | |
1047 | } | |
1048 | ||
1049 | elsif (/^\s*\#ifdef\s+ICU_UCHAR_USE_DEPRECATES\b/) { | |
1050 | $mode = 'DEPRECATED'; | |
1051 | } | |
1052 | } | |
1053 | } | |
1054 | ||
1055 | $in->close(); | |
1056 | ||
1057 | $hash; | |
1058 | } | |
1059 | ||
1060 | #---------------------------------------------------------------------- | |
1061 | # Read the uchar.h file and compile a mapping of Unicode symbols to | |
1062 | # icu4c enum values. | |
1063 | # | |
1064 | # @param a filename for uchar.h | |
1065 | # | |
1066 | # @return a ref to a hash. The keys of the hash are '_bp' for binary | |
1067 | # properties, '_ep' for enumerated properties, '_dp'/'_sp'/'_mp' for | |
1068 | # double/string/mask properties, and 'gc', 'gcm', 'bc', 'blk', | |
1069 | # 'ea', 'dt', 'jt', 'jg', 'lb', or 'nt' for corresponding property | |
1070 | # value aliases. The values of the hash are subhashes. The subhashes | |
1071 | # have a key of the uchar.h enum symbol, and a value of the alias | |
1072 | # string (as listed in PropertyValueAliases.txt). NOTE: The alias | |
1073 | # string is whatever alias uchar.h lists. This may be either short or | |
1074 | # long, depending on the specific enum. NOTE: For blocks ('blk'), the | |
1075 | # value is a hex code point for the start of the associated block. | |
1076 | # NOTE: The special key _version will map to the Unicode version of | |
1077 | # the file. | |
1078 | sub read_uchar { | |
1079 | ||
1080 | my $filename = shift; | |
1081 | ||
1082 | my $mode = ''; # state machine mode and submode | |
1083 | my $submode = ''; | |
1084 | ||
1085 | my $last = ''; # for line folding | |
1086 | ||
1087 | my $hash = {}; # result | |
1088 | my $key; # first-level key | |
1089 | ||
1090 | my $in = new FileHandle($filename, 'r'); | |
1091 | die "Error: Cannot open $filename" if (!defined $in); | |
1092 | ||
1093 | while (<$in>) { | |
1094 | # Fold continued lines together | |
1095 | if (/^(.*)\\$/) { | |
1096 | $last .= $1; | |
1097 | next; | |
1098 | } elsif ($last) { | |
1099 | $_ = $last . $_; | |
1100 | $last = ''; | |
1101 | } | |
1102 | ||
1103 | # Exit all modes here | |
1104 | if ($mode && $mode ne 'DEPRECATED') { | |
1105 | if (/^\s*\}/) { | |
1106 | $mode = ''; | |
1107 | next; | |
1108 | } | |
1109 | } | |
1110 | ||
1111 | # Handle individual modes | |
1112 | ||
1113 | if ($mode eq 'UProperty') { | |
1114 | if (/^\s*(UCHAR_\w+)\s*[,=]/ || /^\s+(UCHAR_\w+)\s*$/) { | |
1115 | if ($submode) { | |
1116 | addDatum($hash, $key, $1, $submode); | |
1117 | $submode = ''; | |
1118 | } else { | |
1119 | #print "Warning: Ignoring $1\n"; | |
1120 | } | |
1121 | } | |
1122 | ||
1123 | elsif (m|^\s*/\*\*\s*(\w+)\s+property\s+(\w+)|i) { | |
1124 | die "Error: Unmatched tag $submode" if ($submode); | |
1125 | die "Error: Unrecognized UProperty comment: $_" | |
1126 | unless (exists $PROP_TYPE{$1}); | |
1127 | $key = $PROP_TYPE{$1}; | |
1128 | $submode = $2; | |
1129 | } | |
1130 | } | |
1131 | ||
1132 | elsif ($mode eq 'UCharCategory') { | |
1133 | if (/^\s*(U_\w+)\s*=/) { | |
1134 | if ($submode) { | |
1135 | addDatum($hash, 'gc', $1, $submode); | |
1136 | $submode = ''; | |
1137 | } else { | |
1138 | #print "Warning: Ignoring $1\n"; | |
1139 | } | |
1140 | } | |
1141 | ||
1142 | elsif (m|^\s*/\*\*\s*([A-Z][a-z])\s|) { | |
1143 | die "Error: Unmatched tag $submode" if ($submode); | |
1144 | $submode = $1; | |
1145 | } | |
1146 | } | |
1147 | ||
1148 | elsif ($mode eq 'UCharDirection') { | |
1149 | if (/^\s*(U_\w+)\s*[,=]/ || /^\s+(U_\w+)\s*$/) { | |
1150 | if ($submode) { | |
1151 | addDatum($hash, $key, $1, $submode); | |
1152 | $submode = ''; | |
1153 | } else { | |
1154 | #print "Warning: Ignoring $1\n"; | |
1155 | } | |
1156 | } | |
1157 | ||
1158 | elsif (m|/\*\*\s*([A-Z]+)\s|) { | |
1159 | die "Error: Unmatched tag $submode" if ($submode); | |
1160 | $key = 'bc'; | |
1161 | $submode = $1; | |
1162 | } | |
1163 | } | |
1164 | ||
1165 | elsif ($mode eq 'UBlockCode') { | |
1166 | if (m|^\s*(UBLOCK_\w+).+?/\*\[(.+?)\]\*/|) { | |
1167 | addDatum($hash, 'blk', $1, $2); | |
1168 | } | |
1169 | } | |
1170 | ||
1171 | elsif ($mode eq 'UEastAsianWidth') { | |
1172 | if (m|^\s*(U_EA_\w+).+?/\*\[(.+?)\]\*/|) { | |
1173 | addDatum($hash, 'ea', $1, $2); | |
1174 | } | |
1175 | } | |
1176 | ||
1177 | elsif ($mode eq 'UDecompositionType') { | |
1178 | if (m|^\s*(U_DT_\w+).+?/\*\[(.+?)\]\*/|) { | |
1179 | addDatum($hash, 'dt', $1, $2); | |
1180 | } | |
1181 | } | |
1182 | ||
1183 | elsif ($mode eq 'UJoiningType') { | |
1184 | if (m|^\s*(U_JT_\w+).+?/\*\[(.+?)\]\*/|) { | |
1185 | addDatum($hash, 'jt', $1, $2); | |
1186 | } | |
1187 | } | |
1188 | ||
1189 | elsif ($mode eq 'UJoiningGroup') { | |
1190 | if (/^\s*(U_JG_(\w+))/) { | |
1191 | addDatum($hash, 'jg', $1, $2) unless ($2 eq 'COUNT'); | |
1192 | } | |
1193 | } | |
1194 | ||
1195 | elsif ($mode eq 'UGraphemeClusterBreak') { | |
1196 | if (m|^\s*(U_GCB_\w+).+?/\*\[(.+?)\]\*/|) { | |
1197 | addDatum($hash, 'GCB', $1, $2); | |
1198 | } | |
1199 | } | |
1200 | ||
1201 | elsif ($mode eq 'UWordBreakValues') { | |
1202 | if (m|^\s*(U_WB_\w+).+?/\*\[(.+?)\]\*/|) { | |
1203 | addDatum($hash, 'WB', $1, $2); | |
1204 | } | |
1205 | } | |
1206 | ||
1207 | elsif ($mode eq 'USentenceBreak') { | |
1208 | if (m|^\s*(U_SB_\w+).+?/\*\[(.+?)\]\*/|) { | |
1209 | addDatum($hash, 'SB', $1, $2); | |
1210 | } | |
1211 | } | |
1212 | ||
1213 | elsif ($mode eq 'ULineBreak') { | |
1214 | if (m|^\s*(U_LB_\w+).+?/\*\[(.+?)\]\*/|) { | |
1215 | addDatum($hash, 'lb', $1, $2); | |
1216 | } | |
1217 | } | |
1218 | ||
1219 | elsif ($mode eq 'UNumericType') { | |
1220 | if (m|^\s*(U_NT_\w+).+?/\*\[(.+?)\]\*/|) { | |
1221 | addDatum($hash, 'nt', $1, $2); | |
1222 | } | |
1223 | } | |
1224 | ||
1225 | elsif ($mode eq 'UHangulSyllableType') { | |
1226 | if (m|^\s*(U_HST_\w+).+?/\*\[(.+?)\]\*/|) { | |
1227 | addDatum($hash, 'hst', $1, $2); | |
1228 | } | |
1229 | } | |
1230 | ||
1231 | elsif ($mode eq 'DEPRECATED') { | |
1232 | if (/\s*\#ifdef/) { | |
1233 | die "Error: Nested #ifdef"; | |
1234 | } | |
1235 | elsif (/\s*\#endif/) { | |
1236 | $mode = ''; | |
1237 | } | |
1238 | } | |
1239 | ||
1240 | elsif (!$mode) { | |
1241 | if (/^\s*\#define\s+(\w+)\s+(.+)/) { | |
1242 | # #define $left $right | |
1243 | my ($left, $right) = ($1, $2); | |
1244 | ||
1245 | if ($left eq 'U_UNICODE_VERSION') { | |
1246 | my $version = $right; | |
1247 | $version = $1 if ($version =~ /^\"(.*)\"/); | |
1248 | # print "Unicode version: ", $version, "\n"; | |
1249 | die "Error: Multiple versions in $filename" | |
1250 | if (defined $hash->{'_version'}); | |
1251 | $hash->{'_version'} = $version; | |
1252 | } | |
1253 | ||
1254 | elsif ($left =~ /U_GC_(\w+?)_MASK/) { | |
1255 | addDatum($hash, 'gcm', $left, $1); | |
1256 | } | |
1257 | } | |
1258 | ||
1259 | elsif (/^\s*typedef\s+enum\s+(\w+)\s*\{/ || | |
1260 | /^\s*typedef\s+enum\s+(\w+)\s*$/) { | |
1261 | $mode = $1; | |
1262 | #print "Parsing $mode\n"; | |
1263 | } | |
1264 | ||
1265 | elsif (/^\s*enum\s+(\w+)\s*\{/ || | |
1266 | /^\s*enum\s+(\w+)\s*$/) { | |
1267 | $mode = $1; | |
1268 | #print "Parsing $mode\n"; | |
1269 | } | |
1270 | ||
1271 | elsif (/^\s*\#ifdef\s+ICU_UCHAR_USE_DEPRECATES\b/) { | |
1272 | $mode = 'DEPRECATED'; | |
1273 | } | |
1274 | } | |
1275 | } | |
1276 | ||
1277 | $in->close(); | |
1278 | ||
1279 | # hardcode known values for the normalization quick check properties | |
1280 | # see unorm.h for the UNormalizationCheckResult enum | |
1281 | ||
1282 | addDatum($hash, 'NFC_QC', 'UNORM_NO', 'N'); | |
1283 | addDatum($hash, 'NFC_QC', 'UNORM_YES', 'Y'); | |
1284 | addDatum($hash, 'NFC_QC', 'UNORM_MAYBE', 'M'); | |
1285 | ||
1286 | addDatum($hash, 'NFKC_QC', 'UNORM_NO', 'N'); | |
1287 | addDatum($hash, 'NFKC_QC', 'UNORM_YES', 'Y'); | |
1288 | addDatum($hash, 'NFKC_QC', 'UNORM_MAYBE', 'M'); | |
1289 | ||
1290 | # no "maybe" values for NF[K]D | |
1291 | ||
1292 | addDatum($hash, 'NFD_QC', 'UNORM_NO', 'N'); | |
1293 | addDatum($hash, 'NFD_QC', 'UNORM_YES', 'Y'); | |
1294 | ||
1295 | addDatum($hash, 'NFKD_QC', 'UNORM_NO', 'N'); | |
1296 | addDatum($hash, 'NFKD_QC', 'UNORM_YES', 'Y'); | |
1297 | ||
1298 | $hash; | |
1299 | } | |
1300 | ||
1301 | #---------------------------------------------------------------------- | |
1302 | # Add a new value to a two-level hash. That is, given a ref to | |
1303 | # a hash, two keys, and a value, add $hash->{$key1}->{$key2} = $value. | |
1304 | sub addDatum { | |
1305 | my ($h, $k1, $k2, $v) = @_; | |
1306 | if (exists $h->{$k1}->{$k2}) { | |
1307 | die "Error: $k1:$k2 already set to " . | |
1308 | $h->{$k1}->{$k2} . ", cannot set to " . $v; | |
1309 | } | |
1310 | $h->{$k1}->{$k2} = $v; | |
1311 | } | |
1312 | ||
1313 | #eof |