]>
Commit | Line | Data |
---|---|---|
b75a7d8f A |
1 | #!/bin/perl -w |
2 | #******************************************************************* | |
3 | # COPYRIGHT: | |
4 | # Copyright (c) 2002-2003, International Business Machines Corporation and | |
5 | # others. All Rights Reserved. | |
6 | #******************************************************************* | |
7 | ||
8 | # This script reads in UCD files PropertyAliases.txt and | |
9 | # PropertyValueAliases.txt and correlates them with ICU enums | |
10 | # defined in uchar.h and uscript.h. It then outputs a header | |
11 | # file which contains all names and enums. The header is included | |
12 | # by the genpname tool C++ source file, which produces the actual | |
13 | # binary data file. | |
14 | # | |
15 | # See usage note below. | |
16 | # | |
17 | # TODO: The Property[Value]Alias.txt files state that they can support | |
18 | # more than 2 names per property|value. Currently (Unicode 3.2) there | |
19 | # are always 1 or 2 names. If more names were supported, presumably | |
20 | # the format would be something like: | |
21 | # nv ; Numeric_Value | |
22 | # nv ; Value_Numerique | |
23 | # CURRENTLY, this script assumes that there are 1 or two names. Any | |
24 | # duplicates it sees are flagged as an error. If multiple aliases | |
25 | # appear in a future version of Unicode, modify this script to support | |
26 | # that. | |
27 | # | |
28 | # NOTE: As of ICU 2.6, this script has been modified to know about the | |
29 | # pseudo-property gcm/General_Category_Mask, which corresponds to the | |
30 | # uchar.h property UCHAR_GENERAL_CATEGORY_MASK. This property | |
31 | # corresponds to General_Category but is a bitmask value. It does not | |
32 | # exist in the UCD. Therefore, I special case it in several places | |
33 | # (search for General_Category_Mask and gcm). | |
34 | # | |
35 | # NOTE: As of ICU 2.6, this script reads an auxiliary data file, | |
36 | # SyntheticPropertyAliases.txt, containing property aliases not | |
37 | # present in the UCD but present in ICU. This file resides in the | |
38 | # same directory as this script. Its contents are merged into those | |
39 | # of PropertyAliases.txt as if the two files were appended. | |
40 | # | |
41 | # NOTE: The following names are handled specially. See script below | |
42 | # for details. | |
43 | # | |
44 | # T/True | |
45 | # F/False | |
46 | # No_Block | |
47 | # | |
48 | # Author: Alan Liu | |
49 | # Created: October 14 2002 | |
50 | # Since: ICU 2.4 | |
51 | ||
52 | use FileHandle; | |
53 | use strict; | |
54 | use Dumpvalue; | |
55 | ||
56 | my $DEBUG = 0; | |
57 | my $DUMPER = new Dumpvalue; | |
58 | ||
59 | my $count = @ARGV; | |
60 | my $ICU_DIR = shift() || ''; | |
61 | my $OUT_FILE = shift() || 'data.h'; | |
62 | my $HEADER_DIR = "$ICU_DIR/source/common/unicode"; | |
63 | my $UNIDATA_DIR = "$ICU_DIR/source/data/unidata"; | |
64 | ||
65 | # Get the current year from the system | |
66 | my $YEAR = 1900+@{[localtime]}[5]; # Get the current year | |
67 | ||
68 | #---------------------------------------------------------------------- | |
69 | # Top level property keys for binary, enumerated, string, and double props | |
70 | my @TOP = qw( _bp _ep _sp _dp _mp ); | |
71 | ||
72 | # This hash governs how top level properties are grouped into output arrays. | |
73 | #my %TOP_PROPS = ( "VALUED" => [ '_bp', '_ep' ], | |
74 | # "NO_VALUE" => [ '_sp', '_dp' ] );m | |
75 | #my %TOP_PROPS = ( "BINARY" => [ '_bp' ], | |
76 | # "ENUMERATED" => [ '_ep' ], | |
77 | # "STRING" => [ '_sp' ], | |
78 | # "DOUBLE" => [ '_dp' ] ); | |
79 | my %TOP_PROPS = ( "" => [ '_bp', '_ep', '_sp', '_dp', '_mp' ] ); | |
80 | ||
81 | my %PROP_TYPE = (Binary => "_bp", | |
82 | String => "_sp", | |
83 | Double => "_dp", | |
84 | Enumerated => "_ep", | |
85 | Bitmask => "_mp"); | |
86 | #---------------------------------------------------------------------- | |
87 | ||
88 | # Properties that are unsupported in ICU | |
89 | my %UNSUPPORTED = (Composition_Exclusion => 1, | |
90 | Decomposition_Mapping => 1, | |
91 | Expands_On_NFC => 1, | |
92 | Expands_On_NFD => 1, | |
93 | Expands_On_NFKC => 1, | |
94 | Expands_On_NFKD => 1, | |
95 | FC_NFKC_Closure => 1, | |
96 | ID_Start_Exceptions => 1, | |
97 | NFC_Quick_Check => 1, | |
98 | NFD_Quick_Check => 1, | |
99 | NFKC_Quick_Check => 1, | |
100 | NFKD_Quick_Check => 1, | |
101 | Special_Case_Condition => 1, | |
102 | ); | |
103 | ||
104 | # Short names of properties that weren't seen in uchar.h. If the | |
105 | # properties weren't seen, don't complain about the property values | |
106 | # missing. | |
107 | my %MISSING_FROM_UCHAR; | |
108 | ||
109 | #---------------------------------------------------------------------- | |
110 | ||
111 | # Emitted class names | |
112 | my ($STRING_CLASS, $ALIAS_CLASS, $PROPERTY_CLASS) = qw(AliasName Alias Property); | |
113 | ||
114 | if ($count < 1 || $count > 2 || | |
115 | !-d $HEADER_DIR || | |
116 | !-d $UNIDATA_DIR) { | |
117 | my $me = $0; | |
118 | $me =~ s|.+[/\\]||; | |
119 | my $lm = ' ' x length($me); | |
120 | print <<"END"; | |
121 | ||
122 | $me: Reads ICU4C headers and Unicode data files and creates | |
123 | $lm a C header file that is included by genpname. The header | |
124 | $lm file matches constants defined in the ICU4C headers with | |
125 | $lm property|value aliases in the Unicode data files. | |
126 | ||
127 | Usage: $me <icu_dir> [<out_file>] | |
128 | ||
129 | <icu_dir> ICU4C root directory, containing | |
130 | source/common/unicode/uchar.h | |
131 | source/common/unicode/uscript.h | |
132 | source/data/unidata/Blocks.txt | |
133 | source/data/unidata/PropertyAliases.txt | |
134 | source/data/unidata/PropertyValueAliases.txt | |
135 | <out_file> File name of header to be written; | |
136 | default is 'data.h'. | |
137 | ||
138 | The Unicode versions of all input files must match. | |
139 | END | |
140 | exit(1); | |
141 | } | |
142 | ||
143 | my ($h, $version) = readAndMerge($HEADER_DIR, $UNIDATA_DIR); | |
144 | ||
145 | if ($DEBUG) { | |
146 | print "Merged hash:\n"; | |
147 | for my $key (sort keys %$h) { | |
148 | my $hh = $h->{$key}; | |
149 | for my $subkey (sort keys %$hh) { | |
150 | print "$key:$subkey:", $hh->{$subkey}, "\n"; | |
151 | } | |
152 | } | |
153 | } | |
154 | ||
155 | my $out = new FileHandle($OUT_FILE, 'w'); | |
156 | die "Error: Can't write to $OUT_FILE: $!" unless (defined $out); | |
157 | my $save = select($out); | |
158 | formatData($h, $version); | |
159 | select($save); | |
160 | $out->close(); | |
161 | ||
162 | exit(0); | |
163 | ||
164 | #---------------------------------------------------------------------- | |
165 | # From PropList.html: "The properties of the form Other_XXX | |
166 | # are used to generate properties in DerivedCoreProperties.txt. | |
167 | # They are not intended for general use, such as in APIs that | |
168 | # return property values. | |
169 | # Non_Break is not a valid property as of 3.2. | |
170 | sub isIgnoredProperty { | |
171 | local $_ = shift; | |
172 | /^Other_/i || /^Non_Break$/i; | |
173 | } | |
174 | ||
175 | # 'qc' is a pseudo-property matching any quick-check property | |
176 | # see PropertyValueAliases.txt file comments. 'binprop' is | |
177 | # a synthetic binary value alias "True"/"False", not present | |
178 | # in PropertyValueAliases.txt. | |
179 | sub isPseudoProperty { | |
180 | $_[0] eq 'qc' || | |
181 | $_[0] eq 'binprop'; | |
182 | } | |
183 | ||
184 | #---------------------------------------------------------------------- | |
185 | # Emit the combined data from headers and the Unicode database as a | |
186 | # C source code header file. | |
187 | # | |
188 | # @param ref to hash with the data | |
189 | # @param Unicode version, as a string | |
190 | sub formatData { | |
191 | my $h = shift; | |
192 | my $version = shift; | |
193 | ||
194 | my $date = scalar localtime(); | |
195 | print <<"END"; | |
196 | /** | |
197 | * Copyright (C) 2002-$YEAR, International Business Machines Corporation and | |
198 | * others. All Rights Reserved. | |
199 | * | |
200 | * MACHINE GENERATED FILE. !!! Do not edit manually !!! | |
201 | * | |
202 | * Generated from | |
203 | * uchar.h | |
204 | * uscript.h | |
205 | * Blocks.txt | |
206 | * PropertyAliases.txt | |
207 | * PropertyValueAliases.txt | |
208 | * | |
209 | * Date: $date | |
210 | * Unicode version: $version | |
211 | * Script: $0 | |
212 | */ | |
213 | ||
214 | END | |
215 | ||
216 | #------------------------------------------------------------ | |
217 | # Emit Unicode version | |
218 | print "/* Unicode version $version */\n"; | |
219 | my @v = split(/\./, $version); | |
220 | push @v, '0' while (@v < 4); | |
221 | for (my $i=0; $i<@v; ++$i) { | |
222 | print "const uint8_t VERSION_$i = $v[$i];\n"; | |
223 | } | |
224 | print "\n"; | |
225 | ||
226 | #------------------------------------------------------------ | |
227 | # Emit String table | |
228 | # [A table of all identifiers, that is, all long or short property | |
229 | # or value names. The list need NOT be sorted; it will be sorted | |
230 | # by the C program. Strings are referenced by their index into | |
231 | # this table. After sorting, a REMAP[] array is used to map the | |
232 | # old position indices to the new positions.] | |
233 | my %strings; | |
234 | for my $prop (sort keys %$h) { | |
235 | my $hh = $h->{$prop}; | |
236 | for my $enum (sort keys %$hh) { | |
237 | my @a = split(/\|/, $hh->{$enum}); | |
238 | for (@a) { | |
239 | $strings{$_} = 1 if (length($_)); | |
240 | } | |
241 | } | |
242 | } | |
243 | my @strings = sort keys %strings; | |
244 | unshift @strings, ""; | |
245 | ||
246 | print "const int32_t STRING_COUNT = ", scalar @strings, ";\n\n"; | |
247 | ||
248 | # while printing, create a mapping hash from string table entry to index | |
249 | my %stringToID; | |
250 | print "/* to be sorted */\n"; | |
251 | print "const $STRING_CLASS STRING_TABLE[] = {\n"; | |
252 | for (my $i=0; $i<@strings; ++$i) { | |
253 | print " $STRING_CLASS(\"$strings[$i]\", $i),\n"; | |
254 | $stringToID{$strings[$i]} = $i; | |
255 | } | |
256 | print "};\n\n"; | |
257 | ||
258 | # placeholder for the remapping index. this is used to map | |
259 | # indices that we compute here to indices of the sorted | |
260 | # STRING_TABLE. STRING_TABLE will be sorted by the C++ program | |
261 | # using the uprv_comparePropertyNames() function. this will | |
262 | # reshuffle the order. we then use the indices (passed to the | |
263 | # String constructor) to create a REMAP[] array. | |
264 | print "/* to be filled in */\n"; | |
265 | print "int32_t REMAP[", scalar @strings, "];\n\n"; | |
266 | ||
267 | #------------------------------------------------------------ | |
268 | # Emit the name group table | |
269 | # [A table of name groups. A name group is one or more names | |
270 | # for a property or property value. The Unicode data files specify | |
271 | # that there may be more than 2, although as of Unicode 3.2 there | |
272 | # are at most 2. The name group table looks like this: | |
273 | # | |
274 | # 114, -115, 116, -117, 0, -118, 65, -64, ... | |
275 | # [0] [2] [4] [6] | |
276 | # | |
277 | # The entry at [0] consists of 2 strings, 114 and 115. | |
278 | # The entry at [2] consists of 116 and 117. The entry at | |
279 | # [4] is one string, 118. There is always at least one | |
280 | # string; typically there are two. If there are two, the first | |
281 | # is the SHORT name and the second is the LONG. If there is | |
282 | # one, then the missing entry (always the short name, in 3.2) | |
283 | # is zero, which is by definition the index of "". The | |
284 | # 'preferred' name will generally be the LONG name, if there are | |
285 | # more than 2 entries. The last entry is negative. | |
286 | ||
287 | # Build name group list and replace string refs with nameGroup indices | |
288 | my @nameGroups; | |
289 | ||
290 | # Check for duplicate name groups, and reuse them if possible | |
291 | my %groupToInt; # Map group strings to ints | |
292 | for my $prop (sort keys %$h) { | |
293 | my $hh = $h->{$prop}; | |
294 | for my $enum (sort keys %$hh) { | |
295 | my $groupString = $hh->{$enum}; | |
296 | my $i; | |
297 | if (exists $groupToInt{$groupString}) { | |
298 | $i = $groupToInt{$groupString}; | |
299 | } else { | |
300 | my @names = split(/\|/, $groupString); | |
301 | die "Error: Wrong number of names in " . $groupString if (@names != 2); | |
302 | $i = @nameGroups; # index of group we are making | |
303 | $groupToInt{$groupString} = $i; # Cache for reuse | |
304 | push @nameGroups, map { $stringToID{$_} } @names; | |
305 | $nameGroups[$#nameGroups] = -$nameGroups[$#nameGroups]; # mark end | |
306 | } | |
307 | # now, replace string list with ref to name group | |
308 | $hh->{$enum} = $i; | |
309 | } | |
310 | } | |
311 | ||
312 | print "const int32_t NAME_GROUP_COUNT = ", | |
313 | scalar @nameGroups, ";\n\n"; | |
314 | ||
315 | print "int32_t NAME_GROUP[] = {\n"; | |
316 | # emit one group per line, with annotations | |
317 | for (my $i=0; $i<@nameGroups; ) { | |
318 | my @a; | |
319 | my $line; | |
320 | my $start = $i; | |
321 | for (;;) { | |
322 | my $j = $nameGroups[$i++]; | |
323 | $line .= "$j, "; | |
324 | push @a, abs($j); | |
325 | last if ($j < 0); | |
326 | } | |
327 | print " ", | |
328 | $line, | |
329 | ' 'x(20-length($line)), | |
330 | "/* ", sprintf("%3d", $start), | |
331 | ": \"", join("\", \"", map { $strings[$_] } @a), "\" */\n"; | |
332 | } | |
333 | print "};\n\n"; | |
334 | ||
335 | # This is fixed for 3.2 at "2" but should be calculated dynamically | |
336 | # when more than 2 names appear in Property[Value]Aliases.txt. | |
337 | print "#define MAX_NAMES_PER_GROUP 2\n\n"; | |
338 | ||
339 | #------------------------------------------------------------ | |
340 | # Emit enumerated property values | |
341 | for my $prop (sort keys %$h) { | |
342 | next if ($prop =~ /^_/); | |
343 | my $vh = $h->{$prop}; | |
344 | my $count = scalar keys %$vh; | |
345 | ||
346 | print "const int32_t VALUES_${prop}_COUNT = ", | |
347 | $count, ";\n\n"; | |
348 | ||
349 | print "const $ALIAS_CLASS VALUES_${prop}\[] = {\n"; | |
350 | for my $enum (sort keys %$vh) { | |
351 | #my @names = split(/\|/, $vh->{$enum}); | |
352 | #die "Error: Wrong number of names for $prop:$enum in [" . join(",", @names) . "]" | |
353 | # if (@names != 2); | |
354 | print " $ALIAS_CLASS((int32_t) $enum, ", $vh->{$enum}, "),\n"; | |
355 | #$stringToID{$names[0]}, ", ", | |
356 | #$stringToID{$names[1]}, "),\n"; | |
357 | # "\"", $names[0], "\", ", | |
358 | # "\"", $names[1], "\"),\n"; | |
359 | } | |
360 | print "};\n\n"; | |
361 | } | |
362 | ||
363 | #------------------------------------------------------------ | |
364 | # Emit top-level properties (binary, enumerated, etc.) | |
365 | for my $topName (sort keys %TOP_PROPS) { | |
366 | my $a = $TOP_PROPS{$topName}; | |
367 | my $count = 0; | |
368 | for my $type (@$a) { # "_bp", "_ep", etc. | |
369 | $count += scalar keys %{$h->{$type}}; | |
370 | } | |
371 | ||
372 | print "const int32_t ${topName}PROPERTY_COUNT = $count;\n\n"; | |
373 | ||
374 | print "const $PROPERTY_CLASS ${topName}PROPERTY[] = {\n"; | |
375 | ||
376 | for my $type (@$a) { # "_bp", "_ep", etc. | |
377 | my $p = $h->{$type}; | |
378 | ||
379 | for my $enum (sort keys %$p) { | |
380 | my $name = $strings[$nameGroups[$p->{$enum}]]; | |
381 | ||
382 | my $valueRef = "0, NULL"; | |
383 | if ($type eq '_bp') { | |
384 | $valueRef = "VALUES_binprop_COUNT, VALUES_binprop"; | |
385 | } | |
386 | elsif (exists $h->{$name}) { | |
387 | $valueRef = "VALUES_${name}_COUNT, VALUES_$name"; | |
388 | } | |
389 | ||
390 | print " $PROPERTY_CLASS((int32_t) $enum, ", | |
391 | $p->{$enum}, ", $valueRef),\n"; | |
392 | } | |
393 | } | |
394 | print "};\n\n"; | |
395 | } | |
396 | ||
397 | print "/*eof*/\n"; | |
398 | } | |
399 | ||
400 | #---------------------------------------------------------------------- | |
401 | # Read in the files uchar.h, uscript.h, Blocks.txt, | |
402 | # PropertyAliases.txt, and PropertyValueAliases.txt, | |
403 | # and combine them into one hash. | |
404 | # | |
405 | # @param directory containing headers | |
406 | # @param directory containin Unicode data files | |
407 | # | |
408 | # @return hash ref, Unicode version | |
409 | sub readAndMerge { | |
410 | ||
411 | my ($headerDir, $unidataDir) = @_; | |
412 | ||
413 | my $h = read_uchar("$headerDir/uchar.h"); | |
414 | my $s = read_uscript("$headerDir/uscript.h"); | |
415 | my $b = read_Blocks("$unidataDir/Blocks.txt"); | |
416 | my $pa = {}; | |
417 | read_PropertyAliases($pa, "$unidataDir/PropertyAliases.txt"); | |
418 | read_PropertyAliases($pa, "SyntheticPropertyAliases.txt"); | |
419 | my $va = read_PropertyValueAliases("$unidataDir/PropertyValueAliases.txt"); | |
420 | ||
421 | # Extract property family hash | |
422 | my $fam = $pa->{'_family'}; | |
423 | delete $pa->{'_family'}; | |
424 | ||
425 | # Note: uscript.h has no version string, so don't check it | |
426 | my $version = check_versions([ 'uchar.h', $h ], | |
427 | [ 'Blocks.txt', $b ], | |
428 | [ 'PropertyAliases.txt', $pa ], | |
429 | [ 'PropertyValueAliases.txt', $va ]); | |
430 | ||
431 | # Do this BEFORE merging; merging modifies the hashes | |
432 | check_PropertyValueAliases($pa, $va); | |
433 | ||
434 | # Dump out the $va hash for debugging | |
435 | if ($DEBUG) { | |
436 | print "Property values hash:\n"; | |
437 | for my $key (sort keys %$va) { | |
438 | my $hh = $va->{$key}; | |
439 | for my $subkey (sort keys %$hh) { | |
440 | print "$key:$subkey:", $hh->{$subkey}, "\n"; | |
441 | } | |
442 | } | |
443 | } | |
444 | ||
445 | # Dump out the $s hash for debugging | |
446 | if ($DEBUG) { | |
447 | print "Script hash:\n"; | |
448 | for my $key (sort keys %$s) { | |
449 | print "$key:", $s->{$key}, "\n"; | |
450 | } | |
451 | } | |
452 | ||
453 | # Link in the script data | |
454 | $h->{'sc'} = $s; | |
455 | ||
456 | merge_Blocks($h, $b); | |
457 | ||
458 | merge_PropertyAliases($h, $pa, $fam); | |
459 | ||
460 | merge_PropertyValueAliases($h, $va); | |
461 | ||
462 | ($h, $version); | |
463 | } | |
464 | ||
465 | #---------------------------------------------------------------------- | |
466 | # Ensure that the version strings in the given hashes (under the key | |
467 | # '_version') are compatible. Currently this means they must be | |
468 | # identical, with the exception that "X.Y" will match "X.Y.0". | |
469 | # All hashes must define the key '_version'. | |
470 | # | |
471 | # @param a list of pairs of (file name, hash reference) | |
472 | # | |
473 | # @return the version of all the hashes. Upon return, the '_version' | |
474 | # will be removed from all hashes. | |
475 | sub check_versions { | |
476 | my $version = ''; | |
477 | my $msg = ''; | |
478 | foreach my $a (@_) { | |
479 | my $name = $a->[0]; | |
480 | my $h = $a->[1]; | |
481 | die "Error: No version found" unless (exists $h->{'_version'}); | |
482 | my $v = $h->{'_version'}; | |
483 | delete $h->{'_version'}; | |
484 | ||
485 | # append ".0" if necessary, to standardize to X.Y.Z | |
486 | $v .= '.0' unless ($v =~ /\.\d+\./); | |
487 | $v .= '.0' unless ($v =~ /\.\d+\./); | |
488 | $msg .= "$name = $v\n"; | |
489 | if ($version) { | |
490 | die "Error: Mismatched Unicode versions\n$msg" | |
491 | unless ($version eq $v); | |
492 | } else { | |
493 | $version = $v; | |
494 | } | |
495 | } | |
496 | $version; | |
497 | } | |
498 | ||
499 | #---------------------------------------------------------------------- | |
500 | # Make sure the property names in PropertyValueAliases.txt match those | |
501 | # in PropertyAliases.txt. | |
502 | # | |
503 | # @param a hash ref from read_PropertyAliases. | |
504 | # @param a hash ref from read_PropertyValueAliases. | |
505 | sub check_PropertyValueAliases { | |
506 | my ($pa, $va) = @_; | |
507 | ||
508 | # make a reverse hash of short->long | |
509 | my %rev; | |
510 | for (keys %$pa) { $rev{$pa->{$_}} = $_; } | |
511 | ||
512 | for my $prop (keys %$va) { | |
513 | if (!exists $rev{$prop} && !isPseudoProperty($prop)) { | |
514 | print "Warning: Property $prop from PropertyValueAliases not listed in PropertyAliases\n"; | |
515 | } | |
516 | } | |
517 | } | |
518 | ||
519 | #---------------------------------------------------------------------- | |
520 | # Merge blocks data into uchar.h enum data. In the 'blk' subhash all | |
521 | # code point values, as returned from read_uchar, are replaced by | |
522 | # block names, as read from Blocks.txt and returned by read_Blocks. | |
523 | # The match must be 1-to-1. If there is any failure of 1-to-1 | |
524 | # mapping, an error is signaled. Upon return, the read_Blocks hash | |
525 | # is emptied of all contents, except for those that failed to match. | |
526 | # | |
527 | # The mapping in the 'blk' subhash, after this function returns, is | |
528 | # from uchar.h enum name, e.g. "UBLOCK_BASIC_LATIN", to Blocks.h | |
529 | # pseudo-name, e.g. "Basic Latin". | |
530 | # | |
531 | # @param a hash ref from read_uchar. | |
532 | # @param a hash ref from read_Blocks. | |
533 | sub merge_Blocks { | |
534 | my ($h, $b) = @_; | |
535 | ||
536 | die "Error: No blocks data in uchar.h" | |
537 | unless (exists $h->{'blk'}); | |
538 | my $blk = $h->{'blk'}; | |
539 | for my $enum (keys %$blk) { | |
540 | my $cp = $blk->{$enum}; | |
541 | if ($cp && !exists $b->{$cp}) { | |
542 | die "Error: No block found at $cp in Blocks.txt"; | |
543 | } | |
544 | # Convert code point to pseudo-name: | |
545 | $blk->{$enum} = $b->{$cp}; | |
546 | delete $b->{$cp}; | |
547 | } | |
548 | my $err = ''; | |
549 | for my $cp (keys %$b) { | |
550 | $err .= "Error: Block " . $b->{$cp} . " not listed in uchar.h\n"; | |
551 | } | |
552 | die $err if ($err); | |
553 | } | |
554 | ||
555 | #---------------------------------------------------------------------- | |
556 | # Merge property alias names into the uchar.h hash. The subhashes | |
557 | # under the keys _* (b(inary, e(numerated, s(tring, d(ouble) are | |
558 | # examined and the values of those subhashes are assumed to be long | |
559 | # names in PropertyAliases.txt. They are validated and replaced by | |
560 | # "<short>|<long>". Upon return, the read_PropertyAliases hash is | |
561 | # emptied of all contents, except for those that failed to match. | |
562 | # Unmatched names in PropertyAliases are listed as a warning but do | |
563 | # NOT cause the script to die. | |
564 | # | |
565 | # @param a hash ref from read_uchar. | |
566 | # @param a hash ref from read_PropertyAliases. | |
567 | # @param a hash mapping long names to property family (e.g., 'binary') | |
568 | sub merge_PropertyAliases { | |
569 | my ($h, $pa, $fam) = @_; | |
570 | ||
571 | for my $k (@TOP) { | |
572 | die "Error: No properties data for $k in uchar.h" | |
573 | unless (exists $h->{$k}); | |
574 | } | |
575 | ||
576 | for my $subh (map { $h->{$_} } @TOP) { | |
577 | for my $enum (keys %$subh) { | |
578 | my $name = $subh->{$enum}; | |
579 | die "Error: Property $name not found (or used more than once)" | |
580 | unless (exists $pa->{$name}); | |
581 | ||
582 | $subh->{$enum} = $pa->{$name} . "|" . $name; | |
583 | delete $pa->{$name}; | |
584 | } | |
585 | } | |
586 | my @err; | |
587 | for my $name (keys %$pa) { | |
588 | $MISSING_FROM_UCHAR{$pa->{$name}} = 1; | |
589 | if (exists $UNSUPPORTED{$name}) { | |
590 | push @err, "Info: No enum for " . $fam->{$name} . " property $name in uchar.h"; | |
591 | } elsif (!isIgnoredProperty($name)) { | |
592 | push @err, "Warning: No enum for " . $fam->{$name} . " property $name in uchar.h"; | |
593 | } | |
594 | } | |
595 | print join("\n", sort @err), "\n" if (@err); | |
596 | } | |
597 | ||
598 | #---------------------------------------------------------------------- | |
599 | # Return 1 if two names match ignoring whitespace, '-', and '_'. | |
600 | # Used to match names in Blocks.txt with those in PropertyValueAliases.txt | |
601 | # as of Unicode 4.0. | |
602 | sub matchesLoosely { | |
603 | my ($a, $b) = @_; | |
604 | $a =~ s/[\s\-_]//g; | |
605 | $b =~ s/[\s\-_]//g; | |
606 | $a =~ /^$b$/i; | |
607 | } | |
608 | ||
609 | #---------------------------------------------------------------------- | |
610 | # Merge PropertyValueAliases.txt data into the uchar.h hash. All | |
611 | # properties other than blk, _bp, and _ep are analyzed and mapped to | |
612 | # the names listed in PropertyValueAliases. They are then replaced | |
613 | # with a string of the form "<short>|<long>". The short or long name | |
614 | # may be missing. | |
615 | # | |
616 | # @param a hash ref from read_uchar. | |
617 | # @param a hash ref from read_PropertyValueAliases. | |
618 | sub merge_PropertyValueAliases { | |
619 | my ($h, $va) = @_; | |
620 | ||
621 | my %gcCount; | |
622 | for my $prop (keys %$h) { | |
623 | # _bp, _ep handled in merge_PropertyAliases | |
624 | next if ($prop =~ /^_/); | |
625 | ||
626 | # Special case: gcm | |
627 | my $prop2 = ($prop eq 'gcm') ? 'gc' : $prop; | |
628 | ||
629 | # find corresponding PropertyValueAliases data | |
630 | die "Error: Can't find $prop in PropertyValueAliases.txt" | |
631 | unless (exists $va->{$prop2}); | |
632 | my $pva = $va->{$prop2}; | |
633 | ||
634 | # match up data | |
635 | my $hh = $h->{$prop}; | |
636 | for my $enum (keys %$hh) { | |
637 | ||
638 | my $name = $hh->{$enum}; | |
639 | ||
640 | # look up both long and short & ignore case | |
641 | my $n; | |
642 | if (exists $pva->{$name}) { | |
643 | $n = $name; | |
644 | } else { | |
645 | # iterate (slow) | |
646 | for my $a (keys %$pva) { | |
647 | # case-insensitive match | |
648 | # & case-insensitive reverse match | |
649 | if ($a =~ /^$name$/i || | |
650 | $pva->{$a} =~ /^$name$/i) { | |
651 | $n = $a; | |
652 | last; | |
653 | } | |
654 | } | |
655 | } | |
656 | ||
657 | # For blocks, do a loose match from Blocks.txt pseudo-name | |
658 | # to PropertyValueAliases long name. | |
659 | if (!$n && $prop eq 'blk') { | |
660 | for my $a (keys %$pva) { | |
661 | # The block is only going to match the long name, | |
662 | # but we check both for completeness. As of Unicode | |
663 | # 4.0, blocks do not have short names. | |
664 | if (matchesLoosely($name, $pva->{$a}) || | |
665 | matchesLoosely($name, $a)) { | |
666 | $n = $a; | |
667 | last; | |
668 | } | |
669 | } | |
670 | } | |
671 | ||
672 | die "Error: Property value $prop:$name not found" unless ($n); | |
673 | ||
674 | my $l = $n; | |
675 | my $r = $pva->{$n}; | |
676 | # convert |n/a\d+| to blank | |
677 | $l = '' if ($l =~ m|^n/a\d+$|); | |
678 | $r = '' if ($r =~ m|^n/a\d+$|); | |
679 | ||
680 | $hh->{$enum} = "$l|$r"; | |
681 | # Don't delete the 'gc' properties because we need to share | |
682 | # them between 'gc' and 'gcm'. Count each use instead. | |
683 | if ($prop2 eq 'gc') { | |
684 | ++$gcCount{$n}; | |
685 | } else { | |
686 | delete $pva->{$n}; | |
687 | } | |
688 | } | |
689 | } | |
690 | ||
691 | # Merge the combining class values in manually | |
692 | die "Error: No ccc data" | |
693 | unless exists $va->{'ccc'}; | |
694 | for my $ccc (keys %{$va->{'ccc'}}) { | |
695 | die "Error: Can't overwrite ccc $ccc" | |
696 | if (exists $h->{'ccc'}->{$ccc}); | |
697 | $h->{'ccc'}->{$ccc} = $va->{'ccc'}->{$ccc}; | |
698 | } | |
699 | delete $va->{'ccc'}; | |
700 | ||
701 | # Merge synthetic binary property values in manually. | |
702 | # These are the "True" and "False" value aliases. | |
703 | die "Error: No True/False value aliases" | |
704 | unless exists $va->{'binprop'}; | |
705 | for my $bp (keys %{$va->{'binprop'}}) { | |
706 | $h->{'binprop'}->{$bp} = $va->{'binprop'}->{$bp}; | |
707 | } | |
708 | delete $va->{'binprop'}; | |
709 | ||
710 | my $err = ''; | |
711 | for my $prop (sort keys %$va) { | |
712 | my $hh = $va->{$prop}; | |
713 | for my $subkey (sort keys %$hh) { | |
714 | # 'gc' props are shared with 'gcm'; make sure they were used | |
715 | # once or twice. | |
716 | if ($prop eq 'gc') { | |
717 | my $n = $gcCount{$subkey}; | |
718 | next if ($n >= 1 && $n <= 2); | |
719 | } | |
720 | $err .= "Warning: Enum for value $prop:$subkey not found in uchar.h\n" | |
721 | unless exists $MISSING_FROM_UCHAR{$prop}; | |
722 | } | |
723 | } | |
724 | print $err if ($err); | |
725 | } | |
726 | ||
727 | #---------------------------------------------------------------------- | |
728 | # Read the PropertyAliases.txt file. Return a hash that maps the long | |
729 | # name to the short name. The special key '_version' will map to the | |
730 | # Unicode version of the file. The special key '_family' holds a | |
731 | # subhash that maps long names to a family string, for descriptive | |
732 | # purposes. | |
733 | # | |
734 | # @param a filename for PropertyAliases.txt | |
735 | # @param reference to hash to receive data. Keys are long names. | |
736 | # Values are short names. | |
737 | sub read_PropertyAliases { | |
738 | ||
739 | my $hash = shift; # result | |
740 | ||
741 | my $filename = shift; | |
742 | ||
743 | my $fam = {}; # map long names to family string | |
744 | $fam = $hash->{'_family'} if (exists $hash->{'_family'}); | |
745 | ||
746 | my $family; # binary, enumerated, etc. | |
747 | ||
748 | my $in = new FileHandle($filename, 'r'); | |
749 | die "Error: Cannot open $filename" if (!defined $in); | |
750 | ||
751 | while (<$in>) { | |
752 | ||
753 | # Read version (embedded in a comment) | |
754 | if (/PropertyAliases-(\d+\.\d+\.\d+)/i) { | |
755 | die "Error: Multiple versions in $filename" | |
756 | if (exists $hash->{'_version'}); | |
757 | $hash->{'_version'} = $1; | |
758 | } | |
759 | ||
760 | # Read family heading | |
761 | if (/^\s*\#\s*(.+?)\s*Properties\s*$/) { | |
762 | $family = $1; | |
763 | } | |
764 | ||
765 | # Ignore comments and blank lines | |
766 | s/\#.*//; | |
767 | next unless (/\S/); | |
768 | ||
769 | if (/^\s*(.+?)\s*;\s*(.+?)\s*$/i) { | |
770 | die "Error: Duplicate property $1 in $filename" | |
771 | if (exists $hash->{$2}); | |
772 | $hash->{$2} = $1; | |
773 | $fam->{$2} = $family; | |
774 | } | |
775 | ||
776 | else { | |
777 | die "Error: Can't parse $_ in $filename"; | |
778 | } | |
779 | } | |
780 | ||
781 | $in->close(); | |
782 | ||
783 | $hash->{'_family'} = $fam; | |
784 | } | |
785 | ||
786 | #---------------------------------------------------------------------- | |
787 | # Read the PropertyValueAliases.txt file. Return a two level hash | |
788 | # that maps property_short_name:value_short_name:value_long_name. In | |
789 | # the case of the 'ccc' property, the short name is the numeric class | |
790 | # and the long name is "<short>|<long>". The special key '_version' | |
791 | # will map to the Unicode version of the file. | |
792 | # | |
793 | # @param a filename for PropertyValueAliases.txt | |
794 | # | |
795 | # @return a hash reference. | |
796 | sub read_PropertyValueAliases { | |
797 | ||
798 | my $filename = shift; | |
799 | ||
800 | my $hash = {}; # result | |
801 | ||
802 | my $in = new FileHandle($filename, 'r'); | |
803 | die "Error: Cannot open $filename" if (!defined $in); | |
804 | ||
805 | my $sym = 0; # Used to make "n/a" strings unique | |
806 | ||
807 | while (<$in>) { | |
808 | ||
809 | # Read version (embedded in a comment) | |
810 | if (/PropertyValueAliases-(\d+\.\d+\.\d+)/i) { | |
811 | die "Error: Multiple versions in $filename" | |
812 | if (exists $hash->{'_version'}); | |
813 | $hash->{'_version'} = $1; | |
814 | } | |
815 | ||
816 | # Ignore comments and blank lines | |
817 | s/\#.*//; | |
818 | next unless (/\S/); | |
819 | ||
820 | if (/^\s*(.+?)\s*;/i) { | |
821 | my $prop = $1; | |
822 | my @fields = /;\s*([^\s;]+)/g; | |
823 | die "Error: Wrong number of fields" | |
824 | if (@fields < 2 || @fields > 3); | |
825 | # Make "n/a" strings unique | |
826 | $fields[0] .= sprintf("%03d", $sym++) if ($fields[0] eq 'n/a'); | |
827 | # Squash extra fields together | |
828 | while (@fields > 2) { | |
829 | my $f = pop @fields; | |
830 | $fields[$#fields] .= '|' . $f; | |
831 | } | |
832 | addDatum($hash, $prop, @fields); | |
833 | } | |
834 | ||
835 | else { | |
836 | die "Error: Can't parse $_ in $filename"; | |
837 | } | |
838 | } | |
839 | ||
840 | $in->close(); | |
841 | ||
842 | # Script Qaac (Coptic) is a special case. Handle it here. See UTR#24: | |
843 | # http://www.unicode.org/unicode/reports/tr24/ | |
844 | $hash->{'sc'}->{'Qaac'} = 'Coptic' | |
845 | unless (exists $hash->{'sc'}->{'Qaac'}); | |
846 | ||
847 | # Add T|True and F|False -- these are values we recognize for | |
848 | # binary properties (NOT from PropertyValueAliases.txt). These | |
849 | # are of the same form as the 'ccc' value aliases. | |
850 | $hash->{'binprop'}->{'0'} = 'F|False'; | |
851 | $hash->{'binprop'}->{'1'} = 'T|True'; | |
852 | ||
853 | $hash; | |
854 | } | |
855 | ||
856 | #---------------------------------------------------------------------- | |
857 | # Read the Blocks.txt file. Return a hash that maps the code point | |
858 | # range start to the block name. The special key '_version' will map | |
859 | # to the Unicode version of the file. | |
860 | # | |
861 | # As of Unicode 4.0, the names in the Blocks.txt are no longer the | |
862 | # proper names. The proper names are now listed in PropertyValueAliases. | |
863 | # They are similar but not identical. Furthermore, 4.0 introduces | |
864 | # a new block name, No_Block, which is listed only in PropertyValueAliases | |
865 | # and not in Blocks.txt. As a result, we handle blocks as follows: | |
866 | # | |
867 | # 1. Read Blocks.txt to map code point range start to quasi-block name. | |
868 | # 2. Add to Blocks.txt a synthetic No Block code point & name: | |
869 | # X -> No Block | |
870 | # 3. Map quasi-names from Blocks.txt (including No Block) to actual | |
871 | # names from PropertyValueAliases. This occurs in | |
872 | # merge_PropertyValueAliases. | |
873 | # | |
874 | # @param a filename for Blocks.txt | |
875 | # | |
876 | # @return a ref to a hash. Keys are code points, as text, e.g., | |
877 | # "1720". Values are pseudo-block names, e.g., "Hanunoo". | |
878 | sub read_Blocks { | |
879 | ||
880 | my $filename = shift; | |
881 | ||
882 | my $hash = {}; # result | |
883 | ||
884 | my $in = new FileHandle($filename, 'r'); | |
885 | die "Error: Cannot open $filename" if (!defined $in); | |
886 | ||
887 | while (<$in>) { | |
888 | ||
889 | # Read version (embedded in a comment) | |
890 | if (/Blocks-(\d+\.\d+\.\d+)/i) { | |
891 | die "Error: Multiple versions in $filename" | |
892 | if (exists $hash->{'_version'}); | |
893 | $hash->{'_version'} = $1; | |
894 | } | |
895 | ||
896 | # Ignore comments and blank lines | |
897 | s/\#.*//; | |
898 | next unless (/\S/); | |
899 | ||
900 | if (/^([0-9a-f]+)\.\.[0-9a-f]+;\s*(.+?)\s*$/i) { | |
901 | die "Error: Duplicate range $1 in $filename" | |
902 | if (exists $hash->{$1}); | |
903 | $hash->{$1} = $2; | |
904 | } | |
905 | ||
906 | else { | |
907 | die "Error: Can't parse $_ in $filename"; | |
908 | } | |
909 | } | |
910 | ||
911 | $in->close(); | |
912 | ||
913 | # Add pseudo-name for No Block | |
914 | $hash->{'none'} = 'No Block'; | |
915 | ||
916 | $hash; | |
917 | } | |
918 | ||
919 | #---------------------------------------------------------------------- | |
920 | # Read the uscript.h file and compile a mapping of Unicode symbols to | |
921 | # icu4c enum values. | |
922 | # | |
923 | # @param a filename for uscript.h | |
924 | # | |
925 | # @return a ref to a hash. The keys of the hash are enum symbols from | |
926 | # uscript.h, and the values are script names. | |
927 | sub read_uscript { | |
928 | ||
929 | my $filename = shift; | |
930 | ||
931 | my $mode = ''; # state machine mode and submode | |
932 | my $submode = ''; | |
933 | ||
934 | my $last = ''; # for line folding | |
935 | ||
936 | my $hash = {}; # result | |
937 | my $key; # first-level key | |
938 | ||
939 | my $in = new FileHandle($filename, 'r'); | |
940 | die "Error: Cannot open $filename" if (!defined $in); | |
941 | ||
942 | while (<$in>) { | |
943 | # Fold continued lines together | |
944 | if (/^(.*)\\$/) { | |
945 | $last = $1; | |
946 | next; | |
947 | } elsif ($last) { | |
948 | $_ = $last . $_; | |
949 | $last = ''; | |
950 | } | |
951 | ||
952 | # Exit all modes here | |
953 | if ($mode && $mode ne 'DEPRECATED') { | |
954 | if (/^\s*\}/) { | |
955 | $mode = ''; | |
956 | next; | |
957 | } | |
958 | } | |
959 | ||
960 | # Handle individual modes | |
961 | ||
962 | if ($mode eq 'UScriptCode') { | |
963 | if (m|^\s*(USCRIPT_\w+).+?/\*\s*(\w+)|) { | |
964 | my ($enum, $code) = ($1, $2); | |
965 | die "Error: Duplicate script $enum" | |
966 | if (exists $hash->{$enum}); | |
967 | $hash->{$enum} = $code; | |
968 | } | |
969 | } | |
970 | ||
971 | elsif ($mode eq 'DEPRECATED') { | |
972 | if (/\s*\#ifdef/) { | |
973 | die "Error: Nested #ifdef"; | |
974 | } | |
975 | elsif (/\s*\#endif/) { | |
976 | $mode = ''; | |
977 | } | |
978 | } | |
979 | ||
980 | elsif (!$mode) { | |
981 | if (/^\s*typedef\s+enum\s+(\w+)\s*\{/ || | |
982 | /^\s*typedef\s+enum\s+(\w+)\s*$/) { | |
983 | $mode = $1; | |
984 | #print "Parsing $mode\n"; | |
985 | } | |
986 | ||
987 | elsif (/^\s*\#ifdef\s+ICU_UCHAR_USE_DEPRECATES\b/) { | |
988 | $mode = 'DEPRECATED'; | |
989 | } | |
990 | } | |
991 | } | |
992 | ||
993 | $in->close(); | |
994 | ||
995 | $hash; | |
996 | } | |
997 | ||
998 | #---------------------------------------------------------------------- | |
999 | # Read the uchar.h file and compile a mapping of Unicode symbols to | |
1000 | # icu4c enum values. | |
1001 | # | |
1002 | # @param a filename for uchar.h | |
1003 | # | |
1004 | # @return a ref to a hash. The keys of the hash are '_bp' for binary | |
1005 | # properties, '_ep' for enumerated properties, '_dp'/'_sp'/'_mp' for | |
1006 | # double/string/mask properties, and 'gc', 'gcm', 'bc', 'blk', | |
1007 | # 'ea', 'dt', 'jt', 'jg', 'lb', or 'nt' for corresponding property | |
1008 | # value aliases. The values of the hash are subhashes. The subhashes | |
1009 | # have a key of the uchar.h enum symbol, and a value of the alias | |
1010 | # string (as listed in PropertyValueAliases.txt). NOTE: The alias | |
1011 | # string is whatever alias uchar.h lists. This may be either short or | |
1012 | # long, depending on the specific enum. NOTE: For blocks ('blk'), the | |
1013 | # value is a hex code point for the start of the associated block. | |
1014 | # NOTE: The special key _version will map to the Unicode version of | |
1015 | # the file. | |
1016 | sub read_uchar { | |
1017 | ||
1018 | my $filename = shift; | |
1019 | ||
1020 | my $mode = ''; # state machine mode and submode | |
1021 | my $submode = ''; | |
1022 | ||
1023 | my $last = ''; # for line folding | |
1024 | ||
1025 | my $hash = {}; # result | |
1026 | my $key; # first-level key | |
1027 | ||
1028 | my $in = new FileHandle($filename, 'r'); | |
1029 | die "Error: Cannot open $filename" if (!defined $in); | |
1030 | ||
1031 | while (<$in>) { | |
1032 | # Fold continued lines together | |
1033 | if (/^(.*)\\$/) { | |
1034 | $last .= $1; | |
1035 | next; | |
1036 | } elsif ($last) { | |
1037 | $_ = $last . $_; | |
1038 | $last = ''; | |
1039 | } | |
1040 | ||
1041 | # Exit all modes here | |
1042 | if ($mode && $mode ne 'DEPRECATED') { | |
1043 | if (/^\s*\}/) { | |
1044 | $mode = ''; | |
1045 | next; | |
1046 | } | |
1047 | } | |
1048 | ||
1049 | # Handle individual modes | |
1050 | ||
1051 | if ($mode eq 'UProperty') { | |
1052 | if (/^\s*(UCHAR_\w+)\s*[,=]/ || /^\s+(UCHAR_\w+)\s*$/) { | |
1053 | if ($submode) { | |
1054 | addDatum($hash, $key, $1, $submode); | |
1055 | $submode = ''; | |
1056 | } else { | |
1057 | #print "Warning: Ignoring $1\n"; | |
1058 | } | |
1059 | } | |
1060 | ||
1061 | elsif (m|^\s*/\*\*\s*(\w+)\s+property\s+(\w+)|i) { | |
1062 | die "Error: Unmatched tag $submode" if ($submode); | |
1063 | die "Error: Unrecognized UProperty comment: $_" | |
1064 | unless (exists $PROP_TYPE{$1}); | |
1065 | $key = $PROP_TYPE{$1}; | |
1066 | $submode = $2; | |
1067 | } | |
1068 | } | |
1069 | ||
1070 | elsif ($mode eq 'UCharCategory') { | |
1071 | if (/^\s*(U_\w+)\s*=/) { | |
1072 | if ($submode) { | |
1073 | addDatum($hash, 'gc', $1, $submode); | |
1074 | $submode = ''; | |
1075 | } else { | |
1076 | #print "Warning: Ignoring $1\n"; | |
1077 | } | |
1078 | } | |
1079 | ||
1080 | elsif (m|^\s*/\*\*\s*([A-Z][a-z])\s|) { | |
1081 | die "Error: Unmatched tag $submode" if ($submode); | |
1082 | $submode = $1; | |
1083 | } | |
1084 | } | |
1085 | ||
1086 | elsif ($mode eq 'UCharDirection') { | |
1087 | if (/^\s*(U_\w+)\s*[,=]/ || /^\s+(U_\w+)\s*$/) { | |
1088 | if ($submode) { | |
1089 | addDatum($hash, $key, $1, $submode); | |
1090 | $submode = ''; | |
1091 | } else { | |
1092 | #print "Warning: Ignoring $1\n"; | |
1093 | } | |
1094 | } | |
1095 | ||
1096 | elsif (m|/\*\*\s*([A-Z]+)\s|) { | |
1097 | die "Error: Unmatched tag $submode" if ($submode); | |
1098 | $key = 'bc'; | |
1099 | $submode = $1; | |
1100 | } | |
1101 | } | |
1102 | ||
1103 | elsif ($mode eq 'UBlockCode') { | |
1104 | if (m|^\s*(UBLOCK_\w+).+?/\*\[(.+?)\]\*/|) { | |
1105 | addDatum($hash, 'blk', $1, $2); | |
1106 | } | |
1107 | } | |
1108 | ||
1109 | elsif ($mode eq 'UEastAsianWidth') { | |
1110 | if (m|^\s*(U_EA_\w+).+?/\*\[(.+?)\]\*/|) { | |
1111 | addDatum($hash, 'ea', $1, $2); | |
1112 | } | |
1113 | } | |
1114 | ||
1115 | elsif ($mode eq 'UDecompositionType') { | |
1116 | if (m|^\s*(U_DT_\w+).+?/\*\[(.+?)\]\*/|) { | |
1117 | addDatum($hash, 'dt', $1, $2); | |
1118 | } | |
1119 | } | |
1120 | ||
1121 | elsif ($mode eq 'UJoiningType') { | |
1122 | if (m|^\s*(U_JT_\w+).+?/\*\[(.+?)\]\*/|) { | |
1123 | addDatum($hash, 'jt', $1, $2); | |
1124 | } | |
1125 | } | |
1126 | ||
1127 | elsif ($mode eq 'UJoiningGroup') { | |
1128 | if (/^\s*(U_JG_(\w+))/) { | |
1129 | addDatum($hash, 'jg', $1, $2) unless ($2 eq 'COUNT'); | |
1130 | } | |
1131 | } | |
1132 | ||
1133 | elsif ($mode eq 'ULineBreak') { | |
1134 | if (m|^\s*(U_LB_\w+).+?/\*\[(.+?)\]\*/|) { | |
1135 | addDatum($hash, 'lb', $1, $2); | |
1136 | } | |
1137 | } | |
1138 | ||
1139 | elsif ($mode eq 'UNumericType') { | |
1140 | if (m|^\s*(U_NT_\w+).+?/\*\[(.+?)\]\*/|) { | |
1141 | addDatum($hash, 'nt', $1, $2); | |
1142 | } | |
1143 | } | |
1144 | ||
1145 | elsif ($mode eq 'UHangulSyllableType') { | |
1146 | if (m|^\s*(U_HST_\w+).+?/\*\[(.+?)\]\*/|) { | |
1147 | addDatum($hash, 'hst', $1, $2); | |
1148 | } | |
1149 | } | |
1150 | ||
1151 | elsif ($mode eq 'DEPRECATED') { | |
1152 | if (/\s*\#ifdef/) { | |
1153 | die "Error: Nested #ifdef"; | |
1154 | } | |
1155 | elsif (/\s*\#endif/) { | |
1156 | $mode = ''; | |
1157 | } | |
1158 | } | |
1159 | ||
1160 | elsif (!$mode) { | |
1161 | if (/^\s*\#define\s+(\w+)\s+(.+)/) { | |
1162 | # #define $left $right | |
1163 | my ($left, $right) = ($1, $2); | |
1164 | ||
1165 | if ($left eq 'U_UNICODE_VERSION') { | |
1166 | my $version = $right; | |
1167 | $version = $1 if ($version =~ /^\"(.*)\"/); | |
1168 | # print "Unicode version: ", $version, "\n"; | |
1169 | die "Error: Multiple versions in $filename" | |
1170 | if (defined $hash->{'_version'}); | |
1171 | $hash->{'_version'} = $version; | |
1172 | } | |
1173 | ||
1174 | elsif ($left =~ /U_GC_(\w+?)_MASK/) { | |
1175 | addDatum($hash, 'gcm', $left, $1); | |
1176 | } | |
1177 | } | |
1178 | ||
1179 | elsif (/^\s*typedef\s+enum\s+(\w+)\s*\{/ || | |
1180 | /^\s*typedef\s+enum\s+(\w+)\s*$/) { | |
1181 | $mode = $1; | |
1182 | #print "Parsing $mode\n"; | |
1183 | } | |
1184 | ||
1185 | elsif (/^\s*enum\s+(\w+)\s*\{/ || | |
1186 | /^\s*enum\s+(\w+)\s*$/) { | |
1187 | $mode = $1; | |
1188 | #print "Parsing $mode\n"; | |
1189 | } | |
1190 | ||
1191 | elsif (/^\s*\#ifdef\s+ICU_UCHAR_USE_DEPRECATES\b/) { | |
1192 | $mode = 'DEPRECATED'; | |
1193 | } | |
1194 | } | |
1195 | } | |
1196 | ||
1197 | $in->close(); | |
1198 | ||
1199 | $hash; | |
1200 | } | |
1201 | ||
1202 | #---------------------------------------------------------------------- | |
1203 | # Add a new value to a two-level hash. That is, given a ref to | |
1204 | # a hash, two keys, and a value, add $hash->{$key1}->{$key2} = $value. | |
1205 | sub addDatum { | |
1206 | my ($h, $k1, $k2, $v) = @_; | |
1207 | if (exists $h->{$k1}->{$k2}) { | |
1208 | die "Error: $k1:$k2 already set to " . | |
1209 | $h->{$k1}->{$k2} . ", cannot set to " . $v; | |
1210 | } | |
1211 | $h->{$k1}->{$k2} = $v; | |
1212 | } | |
1213 | ||
1214 | #eof |