]>
Commit | Line | Data |
---|---|---|
b75a7d8f A |
1 | #!/bin/perl -w |
2 | #******************************************************************* | |
3 | # COPYRIGHT: | |
374ca955 | 4 | # Copyright (c) 2002-2004, International Business Machines Corporation and |
b75a7d8f A |
5 | # others. All Rights Reserved. |
6 | #******************************************************************* | |
7 | ||
8 | # This script reads in UCD files PropertyAliases.txt and | |
9 | # PropertyValueAliases.txt and correlates them with ICU enums | |
10 | # defined in uchar.h and uscript.h. It then outputs a header | |
11 | # file which contains all names and enums. The header is included | |
12 | # by the genpname tool C++ source file, which produces the actual | |
13 | # binary data file. | |
14 | # | |
15 | # See usage note below. | |
16 | # | |
17 | # TODO: The Property[Value]Alias.txt files state that they can support | |
18 | # more than 2 names per property|value. Currently (Unicode 3.2) there | |
19 | # are always 1 or 2 names. If more names were supported, presumably | |
20 | # the format would be something like: | |
21 | # nv ; Numeric_Value | |
22 | # nv ; Value_Numerique | |
23 | # CURRENTLY, this script assumes that there are 1 or two names. Any | |
24 | # duplicates it sees are flagged as an error. If multiple aliases | |
25 | # appear in a future version of Unicode, modify this script to support | |
26 | # that. | |
27 | # | |
28 | # NOTE: As of ICU 2.6, this script has been modified to know about the | |
29 | # pseudo-property gcm/General_Category_Mask, which corresponds to the | |
30 | # uchar.h property UCHAR_GENERAL_CATEGORY_MASK. This property | |
31 | # corresponds to General_Category but is a bitmask value. It does not | |
32 | # exist in the UCD. Therefore, I special case it in several places | |
33 | # (search for General_Category_Mask and gcm). | |
34 | # | |
35 | # NOTE: As of ICU 2.6, this script reads an auxiliary data file, | |
36 | # SyntheticPropertyAliases.txt, containing property aliases not | |
37 | # present in the UCD but present in ICU. This file resides in the | |
38 | # same directory as this script. Its contents are merged into those | |
39 | # of PropertyAliases.txt as if the two files were appended. | |
40 | # | |
41 | # NOTE: The following names are handled specially. See script below | |
42 | # for details. | |
43 | # | |
44 | # T/True | |
45 | # F/False | |
46 | # No_Block | |
47 | # | |
48 | # Author: Alan Liu | |
49 | # Created: October 14 2002 | |
50 | # Since: ICU 2.4 | |
51 | ||
52 | use FileHandle; | |
53 | use strict; | |
54 | use Dumpvalue; | |
55 | ||
374ca955 | 56 | my $DEBUG = 1; |
b75a7d8f A |
57 | my $DUMPER = new Dumpvalue; |
58 | ||
59 | my $count = @ARGV; | |
60 | my $ICU_DIR = shift() || ''; | |
61 | my $OUT_FILE = shift() || 'data.h'; | |
62 | my $HEADER_DIR = "$ICU_DIR/source/common/unicode"; | |
63 | my $UNIDATA_DIR = "$ICU_DIR/source/data/unidata"; | |
64 | ||
65 | # Get the current year from the system | |
66 | my $YEAR = 1900+@{[localtime]}[5]; # Get the current year | |
67 | ||
68 | #---------------------------------------------------------------------- | |
69 | # Top level property keys for binary, enumerated, string, and double props | |
70 | my @TOP = qw( _bp _ep _sp _dp _mp ); | |
71 | ||
72 | # This hash governs how top level properties are grouped into output arrays. | |
73 | #my %TOP_PROPS = ( "VALUED" => [ '_bp', '_ep' ], | |
74 | # "NO_VALUE" => [ '_sp', '_dp' ] );m | |
75 | #my %TOP_PROPS = ( "BINARY" => [ '_bp' ], | |
76 | # "ENUMERATED" => [ '_ep' ], | |
77 | # "STRING" => [ '_sp' ], | |
78 | # "DOUBLE" => [ '_dp' ] ); | |
79 | my %TOP_PROPS = ( "" => [ '_bp', '_ep', '_sp', '_dp', '_mp' ] ); | |
80 | ||
81 | my %PROP_TYPE = (Binary => "_bp", | |
82 | String => "_sp", | |
83 | Double => "_dp", | |
84 | Enumerated => "_ep", | |
85 | Bitmask => "_mp"); | |
86 | #---------------------------------------------------------------------- | |
87 | ||
88 | # Properties that are unsupported in ICU | |
89 | my %UNSUPPORTED = (Composition_Exclusion => 1, | |
90 | Decomposition_Mapping => 1, | |
91 | Expands_On_NFC => 1, | |
92 | Expands_On_NFD => 1, | |
93 | Expands_On_NFKC => 1, | |
94 | Expands_On_NFKD => 1, | |
95 | FC_NFKC_Closure => 1, | |
96 | ID_Start_Exceptions => 1, | |
b75a7d8f A |
97 | Special_Case_Condition => 1, |
98 | ); | |
99 | ||
100 | # Short names of properties that weren't seen in uchar.h. If the | |
101 | # properties weren't seen, don't complain about the property values | |
102 | # missing. | |
103 | my %MISSING_FROM_UCHAR; | |
104 | ||
105 | #---------------------------------------------------------------------- | |
106 | ||
107 | # Emitted class names | |
108 | my ($STRING_CLASS, $ALIAS_CLASS, $PROPERTY_CLASS) = qw(AliasName Alias Property); | |
109 | ||
110 | if ($count < 1 || $count > 2 || | |
111 | !-d $HEADER_DIR || | |
112 | !-d $UNIDATA_DIR) { | |
113 | my $me = $0; | |
114 | $me =~ s|.+[/\\]||; | |
115 | my $lm = ' ' x length($me); | |
116 | print <<"END"; | |
117 | ||
118 | $me: Reads ICU4C headers and Unicode data files and creates | |
119 | $lm a C header file that is included by genpname. The header | |
120 | $lm file matches constants defined in the ICU4C headers with | |
121 | $lm property|value aliases in the Unicode data files. | |
122 | ||
123 | Usage: $me <icu_dir> [<out_file>] | |
124 | ||
125 | <icu_dir> ICU4C root directory, containing | |
126 | source/common/unicode/uchar.h | |
127 | source/common/unicode/uscript.h | |
128 | source/data/unidata/Blocks.txt | |
129 | source/data/unidata/PropertyAliases.txt | |
130 | source/data/unidata/PropertyValueAliases.txt | |
131 | <out_file> File name of header to be written; | |
132 | default is 'data.h'. | |
133 | ||
134 | The Unicode versions of all input files must match. | |
135 | END | |
136 | exit(1); | |
137 | } | |
138 | ||
139 | my ($h, $version) = readAndMerge($HEADER_DIR, $UNIDATA_DIR); | |
140 | ||
141 | if ($DEBUG) { | |
142 | print "Merged hash:\n"; | |
143 | for my $key (sort keys %$h) { | |
144 | my $hh = $h->{$key}; | |
145 | for my $subkey (sort keys %$hh) { | |
146 | print "$key:$subkey:", $hh->{$subkey}, "\n"; | |
147 | } | |
148 | } | |
149 | } | |
150 | ||
151 | my $out = new FileHandle($OUT_FILE, 'w'); | |
152 | die "Error: Can't write to $OUT_FILE: $!" unless (defined $out); | |
153 | my $save = select($out); | |
154 | formatData($h, $version); | |
155 | select($save); | |
156 | $out->close(); | |
157 | ||
158 | exit(0); | |
159 | ||
160 | #---------------------------------------------------------------------- | |
161 | # From PropList.html: "The properties of the form Other_XXX | |
162 | # are used to generate properties in DerivedCoreProperties.txt. | |
163 | # They are not intended for general use, such as in APIs that | |
164 | # return property values. | |
165 | # Non_Break is not a valid property as of 3.2. | |
166 | sub isIgnoredProperty { | |
167 | local $_ = shift; | |
168 | /^Other_/i || /^Non_Break$/i; | |
169 | } | |
170 | ||
171 | # 'qc' is a pseudo-property matching any quick-check property | |
172 | # see PropertyValueAliases.txt file comments. 'binprop' is | |
173 | # a synthetic binary value alias "True"/"False", not present | |
174 | # in PropertyValueAliases.txt. | |
175 | sub isPseudoProperty { | |
176 | $_[0] eq 'qc' || | |
177 | $_[0] eq 'binprop'; | |
178 | } | |
179 | ||
180 | #---------------------------------------------------------------------- | |
181 | # Emit the combined data from headers and the Unicode database as a | |
182 | # C source code header file. | |
183 | # | |
184 | # @param ref to hash with the data | |
185 | # @param Unicode version, as a string | |
186 | sub formatData { | |
187 | my $h = shift; | |
188 | my $version = shift; | |
189 | ||
190 | my $date = scalar localtime(); | |
191 | print <<"END"; | |
192 | /** | |
193 | * Copyright (C) 2002-$YEAR, International Business Machines Corporation and | |
194 | * others. All Rights Reserved. | |
195 | * | |
196 | * MACHINE GENERATED FILE. !!! Do not edit manually !!! | |
197 | * | |
198 | * Generated from | |
199 | * uchar.h | |
200 | * uscript.h | |
201 | * Blocks.txt | |
202 | * PropertyAliases.txt | |
203 | * PropertyValueAliases.txt | |
204 | * | |
205 | * Date: $date | |
206 | * Unicode version: $version | |
207 | * Script: $0 | |
208 | */ | |
209 | ||
210 | END | |
211 | ||
212 | #------------------------------------------------------------ | |
213 | # Emit Unicode version | |
214 | print "/* Unicode version $version */\n"; | |
215 | my @v = split(/\./, $version); | |
216 | push @v, '0' while (@v < 4); | |
217 | for (my $i=0; $i<@v; ++$i) { | |
218 | print "const uint8_t VERSION_$i = $v[$i];\n"; | |
219 | } | |
220 | print "\n"; | |
221 | ||
222 | #------------------------------------------------------------ | |
223 | # Emit String table | |
224 | # [A table of all identifiers, that is, all long or short property | |
225 | # or value names. The list need NOT be sorted; it will be sorted | |
226 | # by the C program. Strings are referenced by their index into | |
227 | # this table. After sorting, a REMAP[] array is used to map the | |
228 | # old position indices to the new positions.] | |
229 | my %strings; | |
230 | for my $prop (sort keys %$h) { | |
231 | my $hh = $h->{$prop}; | |
232 | for my $enum (sort keys %$hh) { | |
233 | my @a = split(/\|/, $hh->{$enum}); | |
234 | for (@a) { | |
235 | $strings{$_} = 1 if (length($_)); | |
236 | } | |
237 | } | |
238 | } | |
239 | my @strings = sort keys %strings; | |
240 | unshift @strings, ""; | |
241 | ||
242 | print "const int32_t STRING_COUNT = ", scalar @strings, ";\n\n"; | |
243 | ||
244 | # while printing, create a mapping hash from string table entry to index | |
245 | my %stringToID; | |
246 | print "/* to be sorted */\n"; | |
247 | print "const $STRING_CLASS STRING_TABLE[] = {\n"; | |
248 | for (my $i=0; $i<@strings; ++$i) { | |
249 | print " $STRING_CLASS(\"$strings[$i]\", $i),\n"; | |
250 | $stringToID{$strings[$i]} = $i; | |
251 | } | |
252 | print "};\n\n"; | |
253 | ||
254 | # placeholder for the remapping index. this is used to map | |
255 | # indices that we compute here to indices of the sorted | |
256 | # STRING_TABLE. STRING_TABLE will be sorted by the C++ program | |
257 | # using the uprv_comparePropertyNames() function. this will | |
258 | # reshuffle the order. we then use the indices (passed to the | |
259 | # String constructor) to create a REMAP[] array. | |
260 | print "/* to be filled in */\n"; | |
261 | print "int32_t REMAP[", scalar @strings, "];\n\n"; | |
262 | ||
263 | #------------------------------------------------------------ | |
264 | # Emit the name group table | |
265 | # [A table of name groups. A name group is one or more names | |
266 | # for a property or property value. The Unicode data files specify | |
267 | # that there may be more than 2, although as of Unicode 3.2 there | |
268 | # are at most 2. The name group table looks like this: | |
269 | # | |
270 | # 114, -115, 116, -117, 0, -118, 65, -64, ... | |
271 | # [0] [2] [4] [6] | |
272 | # | |
273 | # The entry at [0] consists of 2 strings, 114 and 115. | |
274 | # The entry at [2] consists of 116 and 117. The entry at | |
275 | # [4] is one string, 118. There is always at least one | |
276 | # string; typically there are two. If there are two, the first | |
277 | # is the SHORT name and the second is the LONG. If there is | |
278 | # one, then the missing entry (always the short name, in 3.2) | |
279 | # is zero, which is by definition the index of "". The | |
280 | # 'preferred' name will generally be the LONG name, if there are | |
281 | # more than 2 entries. The last entry is negative. | |
282 | ||
283 | # Build name group list and replace string refs with nameGroup indices | |
284 | my @nameGroups; | |
285 | ||
286 | # Check for duplicate name groups, and reuse them if possible | |
287 | my %groupToInt; # Map group strings to ints | |
288 | for my $prop (sort keys %$h) { | |
289 | my $hh = $h->{$prop}; | |
290 | for my $enum (sort keys %$hh) { | |
291 | my $groupString = $hh->{$enum}; | |
292 | my $i; | |
293 | if (exists $groupToInt{$groupString}) { | |
294 | $i = $groupToInt{$groupString}; | |
295 | } else { | |
296 | my @names = split(/\|/, $groupString); | |
374ca955 | 297 | die "Error: Wrong number of names in " . $groupString if (@names < 2); |
b75a7d8f A |
298 | $i = @nameGroups; # index of group we are making |
299 | $groupToInt{$groupString} = $i; # Cache for reuse | |
300 | push @nameGroups, map { $stringToID{$_} } @names; | |
301 | $nameGroups[$#nameGroups] = -$nameGroups[$#nameGroups]; # mark end | |
302 | } | |
303 | # now, replace string list with ref to name group | |
304 | $hh->{$enum} = $i; | |
305 | } | |
306 | } | |
307 | ||
308 | print "const int32_t NAME_GROUP_COUNT = ", | |
309 | scalar @nameGroups, ";\n\n"; | |
310 | ||
311 | print "int32_t NAME_GROUP[] = {\n"; | |
312 | # emit one group per line, with annotations | |
374ca955 | 313 | my $max_names = 0; |
b75a7d8f A |
314 | for (my $i=0; $i<@nameGroups; ) { |
315 | my @a; | |
316 | my $line; | |
317 | my $start = $i; | |
318 | for (;;) { | |
319 | my $j = $nameGroups[$i++]; | |
320 | $line .= "$j, "; | |
321 | push @a, abs($j); | |
322 | last if ($j < 0); | |
323 | } | |
324 | print " ", | |
325 | $line, | |
326 | ' 'x(20-length($line)), | |
327 | "/* ", sprintf("%3d", $start), | |
328 | ": \"", join("\", \"", map { $strings[$_] } @a), "\" */\n"; | |
374ca955 A |
329 | $max_names = @a if(@a > $max_names); |
330 | ||
b75a7d8f A |
331 | } |
332 | print "};\n\n"; | |
333 | ||
334 | # This is fixed for 3.2 at "2" but should be calculated dynamically | |
335 | # when more than 2 names appear in Property[Value]Aliases.txt. | |
374ca955 | 336 | print "#define MAX_NAMES_PER_GROUP $max_names\n\n"; |
b75a7d8f A |
337 | |
338 | #------------------------------------------------------------ | |
339 | # Emit enumerated property values | |
340 | for my $prop (sort keys %$h) { | |
341 | next if ($prop =~ /^_/); | |
342 | my $vh = $h->{$prop}; | |
343 | my $count = scalar keys %$vh; | |
344 | ||
345 | print "const int32_t VALUES_${prop}_COUNT = ", | |
346 | $count, ";\n\n"; | |
347 | ||
348 | print "const $ALIAS_CLASS VALUES_${prop}\[] = {\n"; | |
349 | for my $enum (sort keys %$vh) { | |
350 | #my @names = split(/\|/, $vh->{$enum}); | |
351 | #die "Error: Wrong number of names for $prop:$enum in [" . join(",", @names) . "]" | |
352 | # if (@names != 2); | |
353 | print " $ALIAS_CLASS((int32_t) $enum, ", $vh->{$enum}, "),\n"; | |
354 | #$stringToID{$names[0]}, ", ", | |
355 | #$stringToID{$names[1]}, "),\n"; | |
356 | # "\"", $names[0], "\", ", | |
357 | # "\"", $names[1], "\"),\n"; | |
358 | } | |
359 | print "};\n\n"; | |
360 | } | |
361 | ||
362 | #------------------------------------------------------------ | |
363 | # Emit top-level properties (binary, enumerated, etc.) | |
364 | for my $topName (sort keys %TOP_PROPS) { | |
365 | my $a = $TOP_PROPS{$topName}; | |
366 | my $count = 0; | |
367 | for my $type (@$a) { # "_bp", "_ep", etc. | |
368 | $count += scalar keys %{$h->{$type}}; | |
369 | } | |
370 | ||
371 | print "const int32_t ${topName}PROPERTY_COUNT = $count;\n\n"; | |
372 | ||
373 | print "const $PROPERTY_CLASS ${topName}PROPERTY[] = {\n"; | |
374 | ||
375 | for my $type (@$a) { # "_bp", "_ep", etc. | |
376 | my $p = $h->{$type}; | |
377 | ||
378 | for my $enum (sort keys %$p) { | |
379 | my $name = $strings[$nameGroups[$p->{$enum}]]; | |
380 | ||
381 | my $valueRef = "0, NULL"; | |
382 | if ($type eq '_bp') { | |
383 | $valueRef = "VALUES_binprop_COUNT, VALUES_binprop"; | |
384 | } | |
385 | elsif (exists $h->{$name}) { | |
386 | $valueRef = "VALUES_${name}_COUNT, VALUES_$name"; | |
387 | } | |
388 | ||
389 | print " $PROPERTY_CLASS((int32_t) $enum, ", | |
390 | $p->{$enum}, ", $valueRef),\n"; | |
391 | } | |
392 | } | |
393 | print "};\n\n"; | |
394 | } | |
395 | ||
396 | print "/*eof*/\n"; | |
397 | } | |
398 | ||
399 | #---------------------------------------------------------------------- | |
400 | # Read in the files uchar.h, uscript.h, Blocks.txt, | |
401 | # PropertyAliases.txt, and PropertyValueAliases.txt, | |
402 | # and combine them into one hash. | |
403 | # | |
404 | # @param directory containing headers | |
405 | # @param directory containin Unicode data files | |
406 | # | |
407 | # @return hash ref, Unicode version | |
408 | sub readAndMerge { | |
409 | ||
410 | my ($headerDir, $unidataDir) = @_; | |
411 | ||
412 | my $h = read_uchar("$headerDir/uchar.h"); | |
413 | my $s = read_uscript("$headerDir/uscript.h"); | |
414 | my $b = read_Blocks("$unidataDir/Blocks.txt"); | |
415 | my $pa = {}; | |
416 | read_PropertyAliases($pa, "$unidataDir/PropertyAliases.txt"); | |
417 | read_PropertyAliases($pa, "SyntheticPropertyAliases.txt"); | |
418 | my $va = read_PropertyValueAliases("$unidataDir/PropertyValueAliases.txt"); | |
419 | ||
420 | # Extract property family hash | |
421 | my $fam = $pa->{'_family'}; | |
422 | delete $pa->{'_family'}; | |
423 | ||
424 | # Note: uscript.h has no version string, so don't check it | |
425 | my $version = check_versions([ 'uchar.h', $h ], | |
426 | [ 'Blocks.txt', $b ], | |
427 | [ 'PropertyAliases.txt', $pa ], | |
428 | [ 'PropertyValueAliases.txt', $va ]); | |
429 | ||
430 | # Do this BEFORE merging; merging modifies the hashes | |
431 | check_PropertyValueAliases($pa, $va); | |
432 | ||
433 | # Dump out the $va hash for debugging | |
434 | if ($DEBUG) { | |
435 | print "Property values hash:\n"; | |
436 | for my $key (sort keys %$va) { | |
437 | my $hh = $va->{$key}; | |
438 | for my $subkey (sort keys %$hh) { | |
439 | print "$key:$subkey:", $hh->{$subkey}, "\n"; | |
440 | } | |
441 | } | |
442 | } | |
443 | ||
444 | # Dump out the $s hash for debugging | |
445 | if ($DEBUG) { | |
446 | print "Script hash:\n"; | |
447 | for my $key (sort keys %$s) { | |
448 | print "$key:", $s->{$key}, "\n"; | |
449 | } | |
450 | } | |
451 | ||
452 | # Link in the script data | |
453 | $h->{'sc'} = $s; | |
454 | ||
455 | merge_Blocks($h, $b); | |
456 | ||
457 | merge_PropertyAliases($h, $pa, $fam); | |
458 | ||
459 | merge_PropertyValueAliases($h, $va); | |
460 | ||
461 | ($h, $version); | |
462 | } | |
463 | ||
464 | #---------------------------------------------------------------------- | |
465 | # Ensure that the version strings in the given hashes (under the key | |
466 | # '_version') are compatible. Currently this means they must be | |
467 | # identical, with the exception that "X.Y" will match "X.Y.0". | |
468 | # All hashes must define the key '_version'. | |
469 | # | |
470 | # @param a list of pairs of (file name, hash reference) | |
471 | # | |
472 | # @return the version of all the hashes. Upon return, the '_version' | |
473 | # will be removed from all hashes. | |
474 | sub check_versions { | |
475 | my $version = ''; | |
476 | my $msg = ''; | |
477 | foreach my $a (@_) { | |
478 | my $name = $a->[0]; | |
479 | my $h = $a->[1]; | |
480 | die "Error: No version found" unless (exists $h->{'_version'}); | |
481 | my $v = $h->{'_version'}; | |
482 | delete $h->{'_version'}; | |
483 | ||
484 | # append ".0" if necessary, to standardize to X.Y.Z | |
485 | $v .= '.0' unless ($v =~ /\.\d+\./); | |
486 | $v .= '.0' unless ($v =~ /\.\d+\./); | |
487 | $msg .= "$name = $v\n"; | |
488 | if ($version) { | |
489 | die "Error: Mismatched Unicode versions\n$msg" | |
490 | unless ($version eq $v); | |
491 | } else { | |
492 | $version = $v; | |
493 | } | |
494 | } | |
495 | $version; | |
496 | } | |
497 | ||
498 | #---------------------------------------------------------------------- | |
499 | # Make sure the property names in PropertyValueAliases.txt match those | |
500 | # in PropertyAliases.txt. | |
501 | # | |
502 | # @param a hash ref from read_PropertyAliases. | |
503 | # @param a hash ref from read_PropertyValueAliases. | |
504 | sub check_PropertyValueAliases { | |
505 | my ($pa, $va) = @_; | |
506 | ||
507 | # make a reverse hash of short->long | |
508 | my %rev; | |
509 | for (keys %$pa) { $rev{$pa->{$_}} = $_; } | |
510 | ||
511 | for my $prop (keys %$va) { | |
512 | if (!exists $rev{$prop} && !isPseudoProperty($prop)) { | |
513 | print "Warning: Property $prop from PropertyValueAliases not listed in PropertyAliases\n"; | |
514 | } | |
515 | } | |
516 | } | |
517 | ||
518 | #---------------------------------------------------------------------- | |
519 | # Merge blocks data into uchar.h enum data. In the 'blk' subhash all | |
520 | # code point values, as returned from read_uchar, are replaced by | |
521 | # block names, as read from Blocks.txt and returned by read_Blocks. | |
522 | # The match must be 1-to-1. If there is any failure of 1-to-1 | |
523 | # mapping, an error is signaled. Upon return, the read_Blocks hash | |
524 | # is emptied of all contents, except for those that failed to match. | |
525 | # | |
526 | # The mapping in the 'blk' subhash, after this function returns, is | |
527 | # from uchar.h enum name, e.g. "UBLOCK_BASIC_LATIN", to Blocks.h | |
528 | # pseudo-name, e.g. "Basic Latin". | |
529 | # | |
530 | # @param a hash ref from read_uchar. | |
531 | # @param a hash ref from read_Blocks. | |
532 | sub merge_Blocks { | |
533 | my ($h, $b) = @_; | |
534 | ||
535 | die "Error: No blocks data in uchar.h" | |
536 | unless (exists $h->{'blk'}); | |
537 | my $blk = $h->{'blk'}; | |
538 | for my $enum (keys %$blk) { | |
539 | my $cp = $blk->{$enum}; | |
540 | if ($cp && !exists $b->{$cp}) { | |
541 | die "Error: No block found at $cp in Blocks.txt"; | |
542 | } | |
543 | # Convert code point to pseudo-name: | |
544 | $blk->{$enum} = $b->{$cp}; | |
545 | delete $b->{$cp}; | |
546 | } | |
547 | my $err = ''; | |
548 | for my $cp (keys %$b) { | |
549 | $err .= "Error: Block " . $b->{$cp} . " not listed in uchar.h\n"; | |
550 | } | |
551 | die $err if ($err); | |
552 | } | |
553 | ||
554 | #---------------------------------------------------------------------- | |
555 | # Merge property alias names into the uchar.h hash. The subhashes | |
556 | # under the keys _* (b(inary, e(numerated, s(tring, d(ouble) are | |
557 | # examined and the values of those subhashes are assumed to be long | |
558 | # names in PropertyAliases.txt. They are validated and replaced by | |
559 | # "<short>|<long>". Upon return, the read_PropertyAliases hash is | |
560 | # emptied of all contents, except for those that failed to match. | |
561 | # Unmatched names in PropertyAliases are listed as a warning but do | |
562 | # NOT cause the script to die. | |
563 | # | |
564 | # @param a hash ref from read_uchar. | |
565 | # @param a hash ref from read_PropertyAliases. | |
566 | # @param a hash mapping long names to property family (e.g., 'binary') | |
567 | sub merge_PropertyAliases { | |
568 | my ($h, $pa, $fam) = @_; | |
569 | ||
570 | for my $k (@TOP) { | |
571 | die "Error: No properties data for $k in uchar.h" | |
572 | unless (exists $h->{$k}); | |
573 | } | |
574 | ||
575 | for my $subh (map { $h->{$_} } @TOP) { | |
576 | for my $enum (keys %$subh) { | |
577 | my $name = $subh->{$enum}; | |
578 | die "Error: Property $name not found (or used more than once)" | |
579 | unless (exists $pa->{$name}); | |
580 | ||
581 | $subh->{$enum} = $pa->{$name} . "|" . $name; | |
582 | delete $pa->{$name}; | |
583 | } | |
584 | } | |
585 | my @err; | |
586 | for my $name (keys %$pa) { | |
587 | $MISSING_FROM_UCHAR{$pa->{$name}} = 1; | |
588 | if (exists $UNSUPPORTED{$name}) { | |
589 | push @err, "Info: No enum for " . $fam->{$name} . " property $name in uchar.h"; | |
590 | } elsif (!isIgnoredProperty($name)) { | |
591 | push @err, "Warning: No enum for " . $fam->{$name} . " property $name in uchar.h"; | |
592 | } | |
593 | } | |
594 | print join("\n", sort @err), "\n" if (@err); | |
595 | } | |
596 | ||
597 | #---------------------------------------------------------------------- | |
598 | # Return 1 if two names match ignoring whitespace, '-', and '_'. | |
599 | # Used to match names in Blocks.txt with those in PropertyValueAliases.txt | |
600 | # as of Unicode 4.0. | |
601 | sub matchesLoosely { | |
602 | my ($a, $b) = @_; | |
603 | $a =~ s/[\s\-_]//g; | |
604 | $b =~ s/[\s\-_]//g; | |
605 | $a =~ /^$b$/i; | |
606 | } | |
607 | ||
608 | #---------------------------------------------------------------------- | |
609 | # Merge PropertyValueAliases.txt data into the uchar.h hash. All | |
610 | # properties other than blk, _bp, and _ep are analyzed and mapped to | |
611 | # the names listed in PropertyValueAliases. They are then replaced | |
612 | # with a string of the form "<short>|<long>". The short or long name | |
613 | # may be missing. | |
614 | # | |
615 | # @param a hash ref from read_uchar. | |
616 | # @param a hash ref from read_PropertyValueAliases. | |
617 | sub merge_PropertyValueAliases { | |
618 | my ($h, $va) = @_; | |
619 | ||
620 | my %gcCount; | |
621 | for my $prop (keys %$h) { | |
622 | # _bp, _ep handled in merge_PropertyAliases | |
623 | next if ($prop =~ /^_/); | |
624 | ||
625 | # Special case: gcm | |
626 | my $prop2 = ($prop eq 'gcm') ? 'gc' : $prop; | |
627 | ||
628 | # find corresponding PropertyValueAliases data | |
629 | die "Error: Can't find $prop in PropertyValueAliases.txt" | |
630 | unless (exists $va->{$prop2}); | |
631 | my $pva = $va->{$prop2}; | |
632 | ||
633 | # match up data | |
634 | my $hh = $h->{$prop}; | |
635 | for my $enum (keys %$hh) { | |
636 | ||
637 | my $name = $hh->{$enum}; | |
638 | ||
639 | # look up both long and short & ignore case | |
640 | my $n; | |
641 | if (exists $pva->{$name}) { | |
642 | $n = $name; | |
643 | } else { | |
644 | # iterate (slow) | |
645 | for my $a (keys %$pva) { | |
646 | # case-insensitive match | |
647 | # & case-insensitive reverse match | |
648 | if ($a =~ /^$name$/i || | |
649 | $pva->{$a} =~ /^$name$/i) { | |
650 | $n = $a; | |
651 | last; | |
652 | } | |
653 | } | |
654 | } | |
655 | ||
656 | # For blocks, do a loose match from Blocks.txt pseudo-name | |
657 | # to PropertyValueAliases long name. | |
658 | if (!$n && $prop eq 'blk') { | |
659 | for my $a (keys %$pva) { | |
660 | # The block is only going to match the long name, | |
661 | # but we check both for completeness. As of Unicode | |
662 | # 4.0, blocks do not have short names. | |
663 | if (matchesLoosely($name, $pva->{$a}) || | |
664 | matchesLoosely($name, $a)) { | |
665 | $n = $a; | |
666 | last; | |
667 | } | |
668 | } | |
669 | } | |
670 | ||
671 | die "Error: Property value $prop:$name not found" unless ($n); | |
672 | ||
673 | my $l = $n; | |
674 | my $r = $pva->{$n}; | |
675 | # convert |n/a\d+| to blank | |
676 | $l = '' if ($l =~ m|^n/a\d+$|); | |
677 | $r = '' if ($r =~ m|^n/a\d+$|); | |
678 | ||
679 | $hh->{$enum} = "$l|$r"; | |
680 | # Don't delete the 'gc' properties because we need to share | |
681 | # them between 'gc' and 'gcm'. Count each use instead. | |
682 | if ($prop2 eq 'gc') { | |
683 | ++$gcCount{$n}; | |
684 | } else { | |
685 | delete $pva->{$n}; | |
686 | } | |
687 | } | |
688 | } | |
689 | ||
690 | # Merge the combining class values in manually | |
374ca955 | 691 | # Add the same values to the synthetic lccc and tccc properties |
b75a7d8f A |
692 | die "Error: No ccc data" |
693 | unless exists $va->{'ccc'}; | |
694 | for my $ccc (keys %{$va->{'ccc'}}) { | |
695 | die "Error: Can't overwrite ccc $ccc" | |
696 | if (exists $h->{'ccc'}->{$ccc}); | |
374ca955 A |
697 | $h->{'lccc'}->{$ccc} = |
698 | $h->{'tccc'}->{$ccc} = | |
b75a7d8f A |
699 | $h->{'ccc'}->{$ccc} = $va->{'ccc'}->{$ccc}; |
700 | } | |
701 | delete $va->{'ccc'}; | |
702 | ||
703 | # Merge synthetic binary property values in manually. | |
704 | # These are the "True" and "False" value aliases. | |
705 | die "Error: No True/False value aliases" | |
706 | unless exists $va->{'binprop'}; | |
707 | for my $bp (keys %{$va->{'binprop'}}) { | |
708 | $h->{'binprop'}->{$bp} = $va->{'binprop'}->{$bp}; | |
709 | } | |
710 | delete $va->{'binprop'}; | |
711 | ||
712 | my $err = ''; | |
713 | for my $prop (sort keys %$va) { | |
714 | my $hh = $va->{$prop}; | |
715 | for my $subkey (sort keys %$hh) { | |
716 | # 'gc' props are shared with 'gcm'; make sure they were used | |
717 | # once or twice. | |
718 | if ($prop eq 'gc') { | |
719 | my $n = $gcCount{$subkey}; | |
720 | next if ($n >= 1 && $n <= 2); | |
721 | } | |
722 | $err .= "Warning: Enum for value $prop:$subkey not found in uchar.h\n" | |
723 | unless exists $MISSING_FROM_UCHAR{$prop}; | |
724 | } | |
725 | } | |
726 | print $err if ($err); | |
727 | } | |
728 | ||
729 | #---------------------------------------------------------------------- | |
730 | # Read the PropertyAliases.txt file. Return a hash that maps the long | |
731 | # name to the short name. The special key '_version' will map to the | |
732 | # Unicode version of the file. The special key '_family' holds a | |
733 | # subhash that maps long names to a family string, for descriptive | |
734 | # purposes. | |
735 | # | |
736 | # @param a filename for PropertyAliases.txt | |
737 | # @param reference to hash to receive data. Keys are long names. | |
738 | # Values are short names. | |
739 | sub read_PropertyAliases { | |
740 | ||
741 | my $hash = shift; # result | |
742 | ||
743 | my $filename = shift; | |
744 | ||
745 | my $fam = {}; # map long names to family string | |
746 | $fam = $hash->{'_family'} if (exists $hash->{'_family'}); | |
747 | ||
748 | my $family; # binary, enumerated, etc. | |
749 | ||
750 | my $in = new FileHandle($filename, 'r'); | |
751 | die "Error: Cannot open $filename" if (!defined $in); | |
752 | ||
753 | while (<$in>) { | |
754 | ||
755 | # Read version (embedded in a comment) | |
756 | if (/PropertyAliases-(\d+\.\d+\.\d+)/i) { | |
757 | die "Error: Multiple versions in $filename" | |
758 | if (exists $hash->{'_version'}); | |
759 | $hash->{'_version'} = $1; | |
760 | } | |
761 | ||
762 | # Read family heading | |
763 | if (/^\s*\#\s*(.+?)\s*Properties\s*$/) { | |
764 | $family = $1; | |
765 | } | |
766 | ||
767 | # Ignore comments and blank lines | |
768 | s/\#.*//; | |
769 | next unless (/\S/); | |
770 | ||
771 | if (/^\s*(.+?)\s*;\s*(.+?)\s*$/i) { | |
772 | die "Error: Duplicate property $1 in $filename" | |
773 | if (exists $hash->{$2}); | |
774 | $hash->{$2} = $1; | |
775 | $fam->{$2} = $family; | |
776 | } | |
777 | ||
778 | else { | |
779 | die "Error: Can't parse $_ in $filename"; | |
780 | } | |
781 | } | |
782 | ||
783 | $in->close(); | |
784 | ||
785 | $hash->{'_family'} = $fam; | |
786 | } | |
787 | ||
788 | #---------------------------------------------------------------------- | |
789 | # Read the PropertyValueAliases.txt file. Return a two level hash | |
790 | # that maps property_short_name:value_short_name:value_long_name. In | |
791 | # the case of the 'ccc' property, the short name is the numeric class | |
792 | # and the long name is "<short>|<long>". The special key '_version' | |
793 | # will map to the Unicode version of the file. | |
794 | # | |
795 | # @param a filename for PropertyValueAliases.txt | |
796 | # | |
797 | # @return a hash reference. | |
798 | sub read_PropertyValueAliases { | |
799 | ||
800 | my $filename = shift; | |
801 | ||
802 | my $hash = {}; # result | |
803 | ||
804 | my $in = new FileHandle($filename, 'r'); | |
805 | die "Error: Cannot open $filename" if (!defined $in); | |
806 | ||
807 | my $sym = 0; # Used to make "n/a" strings unique | |
808 | ||
809 | while (<$in>) { | |
810 | ||
811 | # Read version (embedded in a comment) | |
812 | if (/PropertyValueAliases-(\d+\.\d+\.\d+)/i) { | |
813 | die "Error: Multiple versions in $filename" | |
814 | if (exists $hash->{'_version'}); | |
815 | $hash->{'_version'} = $1; | |
816 | } | |
817 | ||
818 | # Ignore comments and blank lines | |
819 | s/\#.*//; | |
820 | next unless (/\S/); | |
821 | ||
822 | if (/^\s*(.+?)\s*;/i) { | |
823 | my $prop = $1; | |
824 | my @fields = /;\s*([^\s;]+)/g; | |
825 | die "Error: Wrong number of fields" | |
826 | if (@fields < 2 || @fields > 3); | |
827 | # Make "n/a" strings unique | |
828 | $fields[0] .= sprintf("%03d", $sym++) if ($fields[0] eq 'n/a'); | |
829 | # Squash extra fields together | |
830 | while (@fields > 2) { | |
831 | my $f = pop @fields; | |
832 | $fields[$#fields] .= '|' . $f; | |
833 | } | |
834 | addDatum($hash, $prop, @fields); | |
835 | } | |
836 | ||
837 | else { | |
838 | die "Error: Can't parse $_ in $filename"; | |
839 | } | |
840 | } | |
841 | ||
842 | $in->close(); | |
843 | ||
844 | # Script Qaac (Coptic) is a special case. Handle it here. See UTR#24: | |
845 | # http://www.unicode.org/unicode/reports/tr24/ | |
846 | $hash->{'sc'}->{'Qaac'} = 'Coptic' | |
847 | unless (exists $hash->{'sc'}->{'Qaac'}); | |
848 | ||
849 | # Add T|True and F|False -- these are values we recognize for | |
850 | # binary properties (NOT from PropertyValueAliases.txt). These | |
851 | # are of the same form as the 'ccc' value aliases. | |
852 | $hash->{'binprop'}->{'0'} = 'F|False'; | |
853 | $hash->{'binprop'}->{'1'} = 'T|True'; | |
854 | ||
855 | $hash; | |
856 | } | |
857 | ||
858 | #---------------------------------------------------------------------- | |
859 | # Read the Blocks.txt file. Return a hash that maps the code point | |
860 | # range start to the block name. The special key '_version' will map | |
861 | # to the Unicode version of the file. | |
862 | # | |
863 | # As of Unicode 4.0, the names in the Blocks.txt are no longer the | |
864 | # proper names. The proper names are now listed in PropertyValueAliases. | |
865 | # They are similar but not identical. Furthermore, 4.0 introduces | |
866 | # a new block name, No_Block, which is listed only in PropertyValueAliases | |
867 | # and not in Blocks.txt. As a result, we handle blocks as follows: | |
868 | # | |
869 | # 1. Read Blocks.txt to map code point range start to quasi-block name. | |
870 | # 2. Add to Blocks.txt a synthetic No Block code point & name: | |
871 | # X -> No Block | |
872 | # 3. Map quasi-names from Blocks.txt (including No Block) to actual | |
873 | # names from PropertyValueAliases. This occurs in | |
874 | # merge_PropertyValueAliases. | |
875 | # | |
876 | # @param a filename for Blocks.txt | |
877 | # | |
878 | # @return a ref to a hash. Keys are code points, as text, e.g., | |
879 | # "1720". Values are pseudo-block names, e.g., "Hanunoo". | |
880 | sub read_Blocks { | |
881 | ||
882 | my $filename = shift; | |
883 | ||
884 | my $hash = {}; # result | |
885 | ||
886 | my $in = new FileHandle($filename, 'r'); | |
887 | die "Error: Cannot open $filename" if (!defined $in); | |
888 | ||
889 | while (<$in>) { | |
890 | ||
891 | # Read version (embedded in a comment) | |
892 | if (/Blocks-(\d+\.\d+\.\d+)/i) { | |
893 | die "Error: Multiple versions in $filename" | |
894 | if (exists $hash->{'_version'}); | |
895 | $hash->{'_version'} = $1; | |
896 | } | |
897 | ||
898 | # Ignore comments and blank lines | |
899 | s/\#.*//; | |
900 | next unless (/\S/); | |
901 | ||
902 | if (/^([0-9a-f]+)\.\.[0-9a-f]+;\s*(.+?)\s*$/i) { | |
903 | die "Error: Duplicate range $1 in $filename" | |
904 | if (exists $hash->{$1}); | |
905 | $hash->{$1} = $2; | |
906 | } | |
907 | ||
908 | else { | |
909 | die "Error: Can't parse $_ in $filename"; | |
910 | } | |
911 | } | |
912 | ||
913 | $in->close(); | |
914 | ||
915 | # Add pseudo-name for No Block | |
916 | $hash->{'none'} = 'No Block'; | |
917 | ||
918 | $hash; | |
919 | } | |
920 | ||
921 | #---------------------------------------------------------------------- | |
922 | # Read the uscript.h file and compile a mapping of Unicode symbols to | |
923 | # icu4c enum values. | |
924 | # | |
925 | # @param a filename for uscript.h | |
926 | # | |
927 | # @return a ref to a hash. The keys of the hash are enum symbols from | |
928 | # uscript.h, and the values are script names. | |
929 | sub read_uscript { | |
930 | ||
931 | my $filename = shift; | |
932 | ||
933 | my $mode = ''; # state machine mode and submode | |
934 | my $submode = ''; | |
935 | ||
936 | my $last = ''; # for line folding | |
937 | ||
938 | my $hash = {}; # result | |
939 | my $key; # first-level key | |
940 | ||
941 | my $in = new FileHandle($filename, 'r'); | |
942 | die "Error: Cannot open $filename" if (!defined $in); | |
943 | ||
944 | while (<$in>) { | |
945 | # Fold continued lines together | |
946 | if (/^(.*)\\$/) { | |
947 | $last = $1; | |
948 | next; | |
949 | } elsif ($last) { | |
950 | $_ = $last . $_; | |
951 | $last = ''; | |
952 | } | |
953 | ||
954 | # Exit all modes here | |
955 | if ($mode && $mode ne 'DEPRECATED') { | |
956 | if (/^\s*\}/) { | |
957 | $mode = ''; | |
958 | next; | |
959 | } | |
960 | } | |
961 | ||
962 | # Handle individual modes | |
963 | ||
964 | if ($mode eq 'UScriptCode') { | |
965 | if (m|^\s*(USCRIPT_\w+).+?/\*\s*(\w+)|) { | |
966 | my ($enum, $code) = ($1, $2); | |
967 | die "Error: Duplicate script $enum" | |
968 | if (exists $hash->{$enum}); | |
969 | $hash->{$enum} = $code; | |
970 | } | |
971 | } | |
972 | ||
973 | elsif ($mode eq 'DEPRECATED') { | |
974 | if (/\s*\#ifdef/) { | |
975 | die "Error: Nested #ifdef"; | |
976 | } | |
977 | elsif (/\s*\#endif/) { | |
978 | $mode = ''; | |
979 | } | |
980 | } | |
981 | ||
982 | elsif (!$mode) { | |
983 | if (/^\s*typedef\s+enum\s+(\w+)\s*\{/ || | |
984 | /^\s*typedef\s+enum\s+(\w+)\s*$/) { | |
985 | $mode = $1; | |
986 | #print "Parsing $mode\n"; | |
987 | } | |
988 | ||
989 | elsif (/^\s*\#ifdef\s+ICU_UCHAR_USE_DEPRECATES\b/) { | |
990 | $mode = 'DEPRECATED'; | |
991 | } | |
992 | } | |
993 | } | |
994 | ||
995 | $in->close(); | |
996 | ||
997 | $hash; | |
998 | } | |
999 | ||
1000 | #---------------------------------------------------------------------- | |
1001 | # Read the uchar.h file and compile a mapping of Unicode symbols to | |
1002 | # icu4c enum values. | |
1003 | # | |
1004 | # @param a filename for uchar.h | |
1005 | # | |
1006 | # @return a ref to a hash. The keys of the hash are '_bp' for binary | |
1007 | # properties, '_ep' for enumerated properties, '_dp'/'_sp'/'_mp' for | |
1008 | # double/string/mask properties, and 'gc', 'gcm', 'bc', 'blk', | |
1009 | # 'ea', 'dt', 'jt', 'jg', 'lb', or 'nt' for corresponding property | |
1010 | # value aliases. The values of the hash are subhashes. The subhashes | |
1011 | # have a key of the uchar.h enum symbol, and a value of the alias | |
1012 | # string (as listed in PropertyValueAliases.txt). NOTE: The alias | |
1013 | # string is whatever alias uchar.h lists. This may be either short or | |
1014 | # long, depending on the specific enum. NOTE: For blocks ('blk'), the | |
1015 | # value is a hex code point for the start of the associated block. | |
1016 | # NOTE: The special key _version will map to the Unicode version of | |
1017 | # the file. | |
1018 | sub read_uchar { | |
1019 | ||
1020 | my $filename = shift; | |
1021 | ||
1022 | my $mode = ''; # state machine mode and submode | |
1023 | my $submode = ''; | |
1024 | ||
1025 | my $last = ''; # for line folding | |
1026 | ||
1027 | my $hash = {}; # result | |
1028 | my $key; # first-level key | |
1029 | ||
1030 | my $in = new FileHandle($filename, 'r'); | |
1031 | die "Error: Cannot open $filename" if (!defined $in); | |
1032 | ||
1033 | while (<$in>) { | |
1034 | # Fold continued lines together | |
1035 | if (/^(.*)\\$/) { | |
1036 | $last .= $1; | |
1037 | next; | |
1038 | } elsif ($last) { | |
1039 | $_ = $last . $_; | |
1040 | $last = ''; | |
1041 | } | |
1042 | ||
1043 | # Exit all modes here | |
1044 | if ($mode && $mode ne 'DEPRECATED') { | |
1045 | if (/^\s*\}/) { | |
1046 | $mode = ''; | |
1047 | next; | |
1048 | } | |
1049 | } | |
1050 | ||
1051 | # Handle individual modes | |
1052 | ||
1053 | if ($mode eq 'UProperty') { | |
1054 | if (/^\s*(UCHAR_\w+)\s*[,=]/ || /^\s+(UCHAR_\w+)\s*$/) { | |
1055 | if ($submode) { | |
1056 | addDatum($hash, $key, $1, $submode); | |
1057 | $submode = ''; | |
1058 | } else { | |
1059 | #print "Warning: Ignoring $1\n"; | |
1060 | } | |
1061 | } | |
1062 | ||
1063 | elsif (m|^\s*/\*\*\s*(\w+)\s+property\s+(\w+)|i) { | |
1064 | die "Error: Unmatched tag $submode" if ($submode); | |
1065 | die "Error: Unrecognized UProperty comment: $_" | |
1066 | unless (exists $PROP_TYPE{$1}); | |
1067 | $key = $PROP_TYPE{$1}; | |
1068 | $submode = $2; | |
1069 | } | |
1070 | } | |
1071 | ||
1072 | elsif ($mode eq 'UCharCategory') { | |
1073 | if (/^\s*(U_\w+)\s*=/) { | |
1074 | if ($submode) { | |
1075 | addDatum($hash, 'gc', $1, $submode); | |
1076 | $submode = ''; | |
1077 | } else { | |
1078 | #print "Warning: Ignoring $1\n"; | |
1079 | } | |
1080 | } | |
1081 | ||
1082 | elsif (m|^\s*/\*\*\s*([A-Z][a-z])\s|) { | |
1083 | die "Error: Unmatched tag $submode" if ($submode); | |
1084 | $submode = $1; | |
1085 | } | |
1086 | } | |
1087 | ||
1088 | elsif ($mode eq 'UCharDirection') { | |
1089 | if (/^\s*(U_\w+)\s*[,=]/ || /^\s+(U_\w+)\s*$/) { | |
1090 | if ($submode) { | |
1091 | addDatum($hash, $key, $1, $submode); | |
1092 | $submode = ''; | |
1093 | } else { | |
1094 | #print "Warning: Ignoring $1\n"; | |
1095 | } | |
1096 | } | |
1097 | ||
1098 | elsif (m|/\*\*\s*([A-Z]+)\s|) { | |
1099 | die "Error: Unmatched tag $submode" if ($submode); | |
1100 | $key = 'bc'; | |
1101 | $submode = $1; | |
1102 | } | |
1103 | } | |
1104 | ||
1105 | elsif ($mode eq 'UBlockCode') { | |
1106 | if (m|^\s*(UBLOCK_\w+).+?/\*\[(.+?)\]\*/|) { | |
1107 | addDatum($hash, 'blk', $1, $2); | |
1108 | } | |
1109 | } | |
1110 | ||
1111 | elsif ($mode eq 'UEastAsianWidth') { | |
1112 | if (m|^\s*(U_EA_\w+).+?/\*\[(.+?)\]\*/|) { | |
1113 | addDatum($hash, 'ea', $1, $2); | |
1114 | } | |
1115 | } | |
1116 | ||
1117 | elsif ($mode eq 'UDecompositionType') { | |
1118 | if (m|^\s*(U_DT_\w+).+?/\*\[(.+?)\]\*/|) { | |
1119 | addDatum($hash, 'dt', $1, $2); | |
1120 | } | |
1121 | } | |
1122 | ||
1123 | elsif ($mode eq 'UJoiningType') { | |
1124 | if (m|^\s*(U_JT_\w+).+?/\*\[(.+?)\]\*/|) { | |
1125 | addDatum($hash, 'jt', $1, $2); | |
1126 | } | |
1127 | } | |
1128 | ||
1129 | elsif ($mode eq 'UJoiningGroup') { | |
1130 | if (/^\s*(U_JG_(\w+))/) { | |
1131 | addDatum($hash, 'jg', $1, $2) unless ($2 eq 'COUNT'); | |
1132 | } | |
1133 | } | |
1134 | ||
1135 | elsif ($mode eq 'ULineBreak') { | |
1136 | if (m|^\s*(U_LB_\w+).+?/\*\[(.+?)\]\*/|) { | |
1137 | addDatum($hash, 'lb', $1, $2); | |
1138 | } | |
1139 | } | |
1140 | ||
1141 | elsif ($mode eq 'UNumericType') { | |
1142 | if (m|^\s*(U_NT_\w+).+?/\*\[(.+?)\]\*/|) { | |
1143 | addDatum($hash, 'nt', $1, $2); | |
1144 | } | |
1145 | } | |
1146 | ||
1147 | elsif ($mode eq 'UHangulSyllableType') { | |
1148 | if (m|^\s*(U_HST_\w+).+?/\*\[(.+?)\]\*/|) { | |
1149 | addDatum($hash, 'hst', $1, $2); | |
1150 | } | |
1151 | } | |
1152 | ||
1153 | elsif ($mode eq 'DEPRECATED') { | |
1154 | if (/\s*\#ifdef/) { | |
1155 | die "Error: Nested #ifdef"; | |
1156 | } | |
1157 | elsif (/\s*\#endif/) { | |
1158 | $mode = ''; | |
1159 | } | |
1160 | } | |
1161 | ||
1162 | elsif (!$mode) { | |
1163 | if (/^\s*\#define\s+(\w+)\s+(.+)/) { | |
1164 | # #define $left $right | |
1165 | my ($left, $right) = ($1, $2); | |
1166 | ||
1167 | if ($left eq 'U_UNICODE_VERSION') { | |
1168 | my $version = $right; | |
1169 | $version = $1 if ($version =~ /^\"(.*)\"/); | |
1170 | # print "Unicode version: ", $version, "\n"; | |
1171 | die "Error: Multiple versions in $filename" | |
1172 | if (defined $hash->{'_version'}); | |
1173 | $hash->{'_version'} = $version; | |
1174 | } | |
1175 | ||
1176 | elsif ($left =~ /U_GC_(\w+?)_MASK/) { | |
1177 | addDatum($hash, 'gcm', $left, $1); | |
1178 | } | |
1179 | } | |
1180 | ||
1181 | elsif (/^\s*typedef\s+enum\s+(\w+)\s*\{/ || | |
1182 | /^\s*typedef\s+enum\s+(\w+)\s*$/) { | |
1183 | $mode = $1; | |
1184 | #print "Parsing $mode\n"; | |
1185 | } | |
1186 | ||
1187 | elsif (/^\s*enum\s+(\w+)\s*\{/ || | |
1188 | /^\s*enum\s+(\w+)\s*$/) { | |
1189 | $mode = $1; | |
1190 | #print "Parsing $mode\n"; | |
1191 | } | |
1192 | ||
1193 | elsif (/^\s*\#ifdef\s+ICU_UCHAR_USE_DEPRECATES\b/) { | |
1194 | $mode = 'DEPRECATED'; | |
1195 | } | |
1196 | } | |
1197 | } | |
1198 | ||
1199 | $in->close(); | |
1200 | ||
374ca955 A |
1201 | # hardcode known values for the normalization quick check properties |
1202 | # see unorm.h for the UNormalizationCheckResult enum | |
1203 | ||
1204 | addDatum($hash, 'NFC_QC', 'UNORM_NO', 'N'); | |
1205 | addDatum($hash, 'NFC_QC', 'UNORM_YES', 'Y'); | |
1206 | addDatum($hash, 'NFC_QC', 'UNORM_MAYBE', 'M'); | |
1207 | ||
1208 | addDatum($hash, 'NFKC_QC', 'UNORM_NO', 'N'); | |
1209 | addDatum($hash, 'NFKC_QC', 'UNORM_YES', 'Y'); | |
1210 | addDatum($hash, 'NFKC_QC', 'UNORM_MAYBE', 'M'); | |
1211 | ||
1212 | # no "maybe" values for NF[K]D | |
1213 | ||
1214 | addDatum($hash, 'NFD_QC', 'UNORM_NO', 'N'); | |
1215 | addDatum($hash, 'NFD_QC', 'UNORM_YES', 'Y'); | |
1216 | ||
1217 | addDatum($hash, 'NFKD_QC', 'UNORM_NO', 'N'); | |
1218 | addDatum($hash, 'NFKD_QC', 'UNORM_YES', 'Y'); | |
1219 | ||
b75a7d8f A |
1220 | $hash; |
1221 | } | |
1222 | ||
1223 | #---------------------------------------------------------------------- | |
1224 | # Add a new value to a two-level hash. That is, given a ref to | |
1225 | # a hash, two keys, and a value, add $hash->{$key1}->{$key2} = $value. | |
1226 | sub addDatum { | |
1227 | my ($h, $k1, $k2, $v) = @_; | |
1228 | if (exists $h->{$k1}->{$k2}) { | |
1229 | die "Error: $k1:$k2 already set to " . | |
1230 | $h->{$k1}->{$k2} . ", cannot set to " . $v; | |
1231 | } | |
1232 | $h->{$k1}->{$k2} = $v; | |
1233 | } | |
1234 | ||
1235 | #eof |