git.saurik.com Git - apple/icu.git/blame_incremental - icuSources/tools/genpname/preparse.pl

... / ...

Commit	Line	Data
	1	#!/bin/perl -w
	2	#*******************************************************************
	3	# COPYRIGHT:
	4	# Copyright (c) 2002-2006, International Business Machines Corporation and
	5	# others. All Rights Reserved.
	6	#*******************************************************************
	7
	8	# This script reads in UCD files PropertyAliases.txt and
	9	# PropertyValueAliases.txt and correlates them with ICU enums
	10	# defined in uchar.h and uscript.h. It then outputs a header
	11	# file which contains all names and enums. The header is included
	12	# by the genpname tool C++ source file, which produces the actual
	13	# binary data file.
	14	#
	15	# See usage note below.
	16	#
	17	# TODO: The Property[Value]Aliases.txt files state that they can support
	18	# more than 2 names per property|value. Currently (Unicode 3.2) there
	19	# are always 1 or 2 names. If more names were supported, presumably
	20	# the format would be something like:
	21	# nv ; Numeric_Value
	22	# nv ; Value_Numerique
	23	# CURRENTLY, this script assumes that there are 1 or 2 names. Any
	24	# duplicates it sees are flagged as an error. If multiple aliases
	25	# appear in a future version of Unicode, modify this script to support
	26	# that.
	27	#
	28	# NOTE: As of ICU 2.6, this script has been modified to know about the
	29	# pseudo-property gcm/General_Category_Mask, which corresponds to the
	30	# uchar.h property UCHAR_GENERAL_CATEGORY_MASK. This property
	31	# corresponds to General_Category but is a bitmask value. It does not
	32	# exist in the UCD. Therefore, I special case it in several places
	33	# (search for General_Category_Mask and gcm).
	34	#
	35	# NOTE: As of ICU 2.6, this script reads an auxiliary data file,
	36	# SyntheticPropertyAliases.txt, containing property aliases not
	37	# present in the UCD but present in ICU. This file resides in the
	38	# same directory as this script. Its contents are merged into those
	39	# of PropertyAliases.txt as if the two files were appended.
	40	#
	41	# NOTE: The following names are handled specially. See script below
	42	# for details.
	43	#
	44	# T/True
	45	# F/False
	46	# No_Block
	47	#
	48	# Author: Alan Liu
	49	# Created: October 14 2002
	50	# Since: ICU 2.4
	51
	52	use FileHandle;
	53	use strict;
	54	use Dumpvalue;
	55
	# Debug switch: when true, the intermediate hashes are dumped to STDOUT.
	my $DEBUG = 1;
	my $DUMPER = Dumpvalue->new;

	# Command line: <icu_dir> [<out_file>]. The argument count is captured
	# before the arguments are shifted off so it can be validated later.
	my $count = scalar @ARGV;
	my $ICU_DIR = shift(@ARGV) || '';
	my $OUT_FILE = shift(@ARGV) || 'data.h';
	my $HEADER_DIR = $ICU_DIR . '/source/common/unicode';
	my $UNIDATA_DIR = $ICU_DIR . '/source/data/unidata';

	# Current year, used in the copyright notice of the generated header.
	my $YEAR = (localtime)[5] + 1900;

	# Counters used to make "n/a" property [value] aliases (Unicode or
	# Synthetic) unique.
	my $propNA = 0;
	my $valueNA = 0;

	#----------------------------------------------------------------------
	# Top level property keys for binary, enumerated, string, double, and
	# bitmask properties.
	my @TOP = qw( _bp _ep _sp _dp _mp );

	# This hash governs how top level properties are grouped into output
	# arrays. Every top level key currently goes into one unnamed group;
	# other groupings (e.g. one array per property type) could be expressed
	# by adding named entries here.
	my %TOP_PROPS = ( "" => [ @TOP ] );

	# Maps a property type name to its top level key.
	my %PROP_TYPE = (
	    Binary     => "_bp",
	    Enumerated => "_ep",
	    String     => "_sp",
	    Double     => "_dp",
	    Bitmask    => "_mp",
	);
	#----------------------------------------------------------------------

	# Properties that are unsupported in ICU
	my %UNSUPPORTED = map { $_ => 1 } qw(
	    Composition_Exclusion
	    Decomposition_Mapping
	    Expands_On_NFC
	    Expands_On_NFD
	    Expands_On_NFKC
	    Expands_On_NFKD
	    FC_NFKC_Closure
	    ID_Start_Exceptions
	    Special_Case_Condition
	);

	# Short names of properties that weren't seen in uchar.h. If the
	# properties weren't seen, don't complain about the property values
	# missing.
	my %MISSING_FROM_UCHAR;

	# Additional property aliases beyond short and long names,
	# like space in addition to WSpace and White_Space in Unicode 4.1.
	# Hashtable, maps long name to alias.
	# For example, maps White_Space->space.
	#
	# If multiple additional aliases are defined,
	# then they are separated in the value string with '|'.
	# For example, White_Space->space|outer_space
	my %additional_property_aliases;

	#----------------------------------------------------------------------

	# Emitted class names
	my $STRING_CLASS = 'AliasName';
	my $ALIAS_CLASS = 'Alias';
	my $PROPERTY_CLASS = 'Property';
	123
	# Validate the command line: one or two arguments are required, and the
	# ICU header and Unicode data directories derived from <icu_dir> must
	# exist. Otherwise print the usage message and exit with status 1.
	my $bad_usage = $count < 1
	    || $count > 2
	    || !-d $HEADER_DIR
	    || !-d $UNIDATA_DIR;
	if ($bad_usage) {
	    # Script name without its directory part, and a blank string of the
	    # same width used to align the continuation lines.
	    (my $script = $0) =~ s|.+[/\\]||;
	    my $pad = ' ' x length($script);
	    print <<"END";

${script}: Reads ICU4C headers and Unicode data files and creates
$pad a C header file that is included by genpname. The header
$pad file matches constants defined in the ICU4C headers with
$pad property|value aliases in the Unicode data files.

Usage: $script <icu_dir> [<out_file>]

<icu_dir> ICU4C root directory, containing
source/common/unicode/uchar.h
source/common/unicode/uscript.h
source/data/unidata/Blocks.txt
source/data/unidata/PropertyAliases.txt
source/data/unidata/PropertyValueAliases.txt
<out_file> File name of header to be written;
default is 'data.h'.

The Unicode versions of all input files must match.
END
	    exit(1);
	}
	152
	# Main driver: read and correlate all inputs, then write the header.
	my ($h, $version) = readAndMerge($HEADER_DIR, $UNIDATA_DIR);

	# Dump the merged two-level hash (property -> enum -> names) for debugging
	if ($DEBUG) {
	    print "Merged hash:\n";
	    for my $key (sort keys %$h) {
	        my $hh = $h->{$key};
	        for my $subkey (sort keys %$hh) {
	            print "$key:$subkey:", $hh->{$subkey}, "\n";
	        }
	    }
	}

	my $out = FileHandle->new($OUT_FILE, 'w');
	die "Error: Can't write to $OUT_FILE: $!" unless (defined $out);

	# formatData() prints to the currently selected handle, so make the
	# output file the default for the duration of the call.
	my $save = select($out);
	formatData($h, $version);
	select($save);

	# Buffered write errors (e.g. disk full) only surface when the handle is
	# closed, so the result of close() must be checked.
	$out->close()
	    or die "Error: Can't finish writing $OUT_FILE: $!";

	exit(0);
	173
	174	#----------------------------------------------------------------------
	# Return true if the given property name should be silently ignored
	# when it has no enum in uchar.h.
	#
	# From PropList.html: "The properties of the form Other_XXX
	# are used to generate properties in DerivedCoreProperties.txt.
	# They are not intended for general use, such as in APIs that
	# return property values."
	# Non_Break is not a valid property as of 3.2.
	#
	# @param a property name; matching is case-insensitive
	sub isIgnoredProperty {
	    my ($name) = @_;
	    return 1 if ($name =~ /^Other_/i);
	    return $name =~ /^Non_Break$/i;
	}
	184
	# Return true if the given short property name is a pseudo-property.
	#
	# 'qc' is a pseudo-property matching any quick-check property;
	# see PropertyValueAliases.txt file comments. 'binprop' is
	# a synthetic binary value alias "True"/"False", not present
	# in PropertyValueAliases.txt.
	#
	# @param a short property name; the comparison is case-sensitive
	sub isPseudoProperty {
	    my %pseudo = ('qc' => 1, 'binprop' => 1);
	    return exists $pseudo{$_[0]};
	}
	193
	194	#----------------------------------------------------------------------
	# Emit the combined data from headers and the Unicode database as a
	# C source code header file. Output goes to the currently selected
	# file handle. As a side effect, every name string stored in the hash
	# is replaced by the index of its name group.
	#
	# @param ref to hash with the data
	# @param Unicode version, as a string
	sub formatData {
	    my ($h, $version) = @_;

	    my $date = scalar localtime();
	    print <<"END";
/**
* Copyright (C) 2002-$YEAR, International Business Machines Corporation and
* others. All Rights Reserved.
*
* MACHINE GENERATED FILE. !!! Do not edit manually !!!
*
* Generated from
* uchar.h
* uscript.h
* Blocks.txt
* PropertyAliases.txt
* PropertyValueAliases.txt
*
* Date: $date
* Unicode version: $version
* Script: $0
*/

END

	    #------------------------------------------------------------
	    # Emit Unicode version, padded with zeros to four fields
	    print "/* Unicode version $version */\n";
	    my @v = split(/\./, $version);
	    push @v, '0' while (@v < 4);
	    for my $i (0 .. $#v) {
	        print "const uint8_t VERSION_$i = $v[$i];\n";
	    }
	    print "\n";

	    #------------------------------------------------------------
	    # Emit String table
	    # [A table of all identifiers, that is, all long or short property
	    # or value names. The list need NOT be sorted; it will be sorted
	    # by the C program. Strings are referenced by their index into
	    # this table. After sorting, a REMAP[] array is used to map the
	    # old position indices to the new positions.]
	    my %seen;
	    for my $group (values %$h) {
	        for my $names (values %$group) {
	            # Names within one entry are separated by a literal '|'
	            $seen{$_} = 1 for grep { length($_) } split(/\|/, $names);
	        }
	    }
	    # Index 0 is by definition the empty string
	    my @strings = ("", sort keys %seen);

	    print "const int32_t STRING_COUNT = ", scalar @strings, ";\n\n";

	    # while printing, create a mapping hash from string table entry to index
	    my %stringToID;
	    print "/* to be sorted */\n";
	    print "const $STRING_CLASS STRING_TABLE[] = {\n";
	    for my $i (0 .. $#strings) {
	        print " $STRING_CLASS(\"$strings[$i]\", $i),\n";
	        $stringToID{$strings[$i]} = $i;
	    }
	    print "};\n\n";

	    # placeholder for the remapping index. this is used to map
	    # indices that we compute here to indices of the sorted
	    # STRING_TABLE. STRING_TABLE will be sorted by the C++ program
	    # using the uprv_comparePropertyNames() function. this will
	    # reshuffle the order. we then use the indices (passed to the
	    # String constructor) to create a REMAP[] array.
	    print "/* to be filled in */\n";
	    print "int32_t REMAP[", scalar @strings, "];\n\n";

	    #------------------------------------------------------------
	    # Emit the name group table
	    # [A table of name groups. A name group is one or more names
	    # for a property or property value. The name group table looks
	    # like this:
	    #
	    # 114, -115, 116, -117, 0, -118, 65, -64, ...
	    # [0]        [2]        [4]      [6]
	    #
	    # The entry at [0] consists of 2 strings, 114 and 115.
	    # The entry at [2] consists of 116 and 117. The entry at
	    # [4] is one string, 118. There is always at least one
	    # string; typically there are two. If there are two, the first
	    # is the SHORT name and the second is the LONG. If the short
	    # name is missing, its entry is zero, which is by definition
	    # the index of "". The last entry of a group is negative.]

	    # Build name group list and replace string refs with nameGroup indices
	    my @nameGroups;

	    # Identical groups are emitted once and shared
	    my %groupToInt; # Map group strings to ints
	    for my $prop (sort keys %$h) {
	        my $hh = $h->{$prop};
	        for my $enum (sort keys %$hh) {
	            my $groupString = $hh->{$enum};
	            if (!exists $groupToInt{$groupString}) {
	                my @names = split(/\|/, $groupString);
	                die "Error: Wrong number of names in " . $groupString if (@names < 1);
	                $groupToInt{$groupString} = scalar @nameGroups; # index of new group
	                push @nameGroups, map { $stringToID{$_} } @names;
	                $nameGroups[-1] = -$nameGroups[-1]; # mark end
	            }
	            # now, replace string list with ref to name group
	            $hh->{$enum} = $groupToInt{$groupString};
	        }
	    }

	    print "const int32_t NAME_GROUP_COUNT = ",
	        scalar @nameGroups, ";\n\n";

	    print "int32_t NAME_GROUP[] = {\n";
	    # emit one group per line, with annotations
	    my $max_names = 0;
	    my $pos = 0;
	    while ($pos < @nameGroups) {
	        my $start = $pos;
	        my @ids;
	        my $line = '';
	        while ($pos < @nameGroups) {
	            my $entry = $nameGroups[$pos++];
	            $line .= "$entry, ";
	            push @ids, abs($entry);
	            last if ($entry < 0); # negative entry terminates the group
	        }
	        print " ",
	            $line,
	            ' 'x(20-length($line)),
	            "/* ", sprintf("%3d", $start),
	            ": \"", join("\", \"", map { $strings[$_] } @ids), "\" */\n";
	        $max_names = @ids if (@ids > $max_names);
	    }
	    print "};\n\n";

	    # Largest number of names seen in any one group
	    print "#define MAX_NAMES_PER_GROUP $max_names\n\n";

	    #------------------------------------------------------------
	    # Emit enumerated property values (every key not starting with '_')
	    for my $prop (sort keys %$h) {
	        next if ($prop =~ /^_/);
	        my $vh = $h->{$prop};
	        my $valueCount = scalar keys %$vh;

	        print "const int32_t VALUES_${prop}_COUNT = ",
	            $valueCount, ";\n\n";

	        print "const $ALIAS_CLASS VALUES_${prop}\[] = {\n";
	        for my $enum (sort keys %$vh) {
	            print " $ALIAS_CLASS((int32_t) $enum, ", $vh->{$enum}, "),\n";
	        }
	        print "};\n\n";
	    }

	    #------------------------------------------------------------
	    # Emit top-level properties (binary, enumerated, etc.)
	    for my $topName (sort keys %TOP_PROPS) {
	        my $types = $TOP_PROPS{$topName}; # "_bp", "_ep", etc.
	        my $total = 0;
	        for my $type (@$types) {
	            $total += scalar keys %{$h->{$type}};
	        }

	        print "const int32_t ${topName}PROPERTY_COUNT = $total;\n\n";

	        print "const $PROPERTY_CLASS ${topName}PROPERTY[] = {\n";

	        for my $type (@$types) {
	            my $p = $h->{$type};

	            for my $enum (sort keys %$p) {
	                # The first entry of the name group is the short name.
	                # abs() is needed because a group with a single name
	                # stores that name negated (end-of-group mark), and a
	                # negative index would silently count from the end of
	                # @strings.
	                my $name = $strings[abs($nameGroups[$p->{$enum}])];

	                my $valueRef = "0, NULL";
	                if ($type eq '_bp') {
	                    # all binary properties share the True/False values
	                    $valueRef = "VALUES_binprop_COUNT, VALUES_binprop";
	                }
	                elsif (exists $h->{$name}) {
	                    $valueRef = "VALUES_${name}_COUNT, VALUES_$name";
	                }

	                print " $PROPERTY_CLASS((int32_t) $enum, ",
	                    $p->{$enum}, ", $valueRef),\n";
	            }
	        }
	        print "};\n\n";
	    }

	    # The trailer must be a C comment; a bare /eof/ does not compile.
	    print "/*eof*/\n";
	}
	412
	413	#----------------------------------------------------------------------
	# Read in the files uchar.h, uscript.h, Blocks.txt,
	# PropertyAliases.txt, and PropertyValueAliases.txt (plus the two
	# Synthetic*.txt companions), and combine them into one hash.
	#
	# The synthetic alias files are looked up in the directory of this
	# script first, so the script can be run from any working directory;
	# if they are not found there, the bare file name (relative to the
	# current directory) is used.
	#
	# @param directory containing headers
	# @param directory containing Unicode data files
	#
	# @return hash ref, Unicode version
	sub readAndMerge {

	    my ($headerDir, $unidataDir) = @_;

	    # Directory part of the script path including the trailing
	    # separator, or '' when $0 has no directory component.
	    (my $scriptDir = $0) =~ s{[^/\\]*\z}{};
	    my $synthetic = sub {
	        my ($file) = @_;
	        my $besideScript = $scriptDir . $file;
	        return (-e $besideScript) ? $besideScript : $file;
	    };

	    my $h = read_uchar("$headerDir/uchar.h");
	    my $scripts = read_uscript("$headerDir/uscript.h");
	    my $blocks = read_Blocks("$unidataDir/Blocks.txt");
	    my $pa = {};
	    read_PropertyAliases($pa, "$unidataDir/PropertyAliases.txt");
	    read_PropertyAliases($pa, $synthetic->("SyntheticPropertyAliases.txt"));
	    my $va = {};
	    read_PropertyValueAliases($va, "$unidataDir/PropertyValueAliases.txt");
	    read_PropertyValueAliases($va, $synthetic->("SyntheticPropertyValueAliases.txt"));

	    # Extract property family hash
	    my $fam = delete $pa->{'_family'};

	    # Note: uscript.h has no version string, so don't check it
	    my $version = check_versions([ 'uchar.h', $h ],
	                                 [ 'Blocks.txt', $blocks ],
	                                 [ 'PropertyAliases.txt', $pa ],
	                                 [ 'PropertyValueAliases.txt', $va ]);

	    # Do this BEFORE merging; merging modifies the hashes
	    check_PropertyValueAliases($pa, $va);

	    if ($DEBUG) {
	        # Dump out the $va hash for debugging
	        print "Property values hash:\n";
	        for my $key (sort keys %$va) {
	            my $hh = $va->{$key};
	            for my $subkey (sort keys %$hh) {
	                print "$key:$subkey:", $hh->{$subkey}, "\n";
	            }
	        }

	        # Dump out the script hash for debugging
	        print "Script hash:\n";
	        for my $key (sort keys %$scripts) {
	            print "$key:", $scripts->{$key}, "\n";
	        }
	    }

	    # Link in the script data
	    $h->{'sc'} = $scripts;

	    merge_Blocks($h, $blocks);

	    merge_PropertyAliases($h, $pa, $fam);

	    merge_PropertyValueAliases($h, $va);

	    return ($h, $version);
	}
	479
	480	#----------------------------------------------------------------------
	# Ensure that the version strings in the given hashes (under the key
	# '_version') are compatible. Currently this means they must be
	# identical, with the exception that "X.Y" will match "X.Y.0".
	# All hashes must define the key '_version'; the script dies otherwise,
	# and also dies on the first mismatch, listing the versions seen so far.
	#
	# @param a list of pairs of (file name, hash reference)
	#
	# @return the version of all the hashes. Upon return, the '_version'
	# key will be removed from all hashes.
	sub check_versions {
	    my @sources = @_;

	    my $agreed = '';
	    my $report = '';
	    for my $pair (@sources) {
	        my ($name, $hash) = @$pair[0, 1];
	        die "Error: No version found" unless (exists $hash->{'_version'});
	        my $v = delete $hash->{'_version'};

	        # append ".0" (at most twice) to standardize to X.Y.Z
	        for my $pass (1, 2) {
	            $v .= '.0' unless ($v =~ /\.\d+\./);
	        }
	        $report .= "$name = $v\n";

	        if (!$agreed) {
	            # first file seen defines the reference version
	            $agreed = $v;
	            next;
	        }
	        die "Error: Mismatched Unicode versions\n$report"
	            unless ($agreed eq $v);
	    }
	    return $agreed;
	}
	513
	514	#----------------------------------------------------------------------
	# Make sure the property names in PropertyValueAliases.txt match those
	# in PropertyAliases.txt. A warning is printed for every property in
	# the value-alias hash that is neither a known short name nor a
	# pseudo-property; nothing is modified.
	#
	# @param a hash ref from read_PropertyAliases (long name -> short name).
	# @param a hash ref from read_PropertyValueAliases (keyed by short name).
	sub check_PropertyValueAliases {
	    my ($pa, $va) = @_;

	    # set of all short names listed in PropertyAliases
	    my %isShortName = map { $_ => 1 } values %$pa;

	    for my $prop (keys %$va) {
	        next if (exists $isShortName{$prop});
	        next if (isPseudoProperty($prop));
	        print "Warning: Property $prop from PropertyValueAliases not listed in PropertyAliases\n";
	    }
	}
	533
	534	#----------------------------------------------------------------------
	# Merge blocks data into uchar.h enum data. In the 'blk' subhash all
	# code point values, as returned from read_uchar, are replaced by
	# block names, as read from Blocks.txt and returned by read_Blocks.
	# The match must be 1-to-1. If there is any failure of 1-to-1
	# mapping, the script dies. Every matched entry is removed from the
	# read_Blocks hash, so on success that hash ends up empty.
	#
	# The mapping in the 'blk' subhash, after this function returns, is
	# from uchar.h enum name, e.g. "UBLOCK_BASIC_LATIN", to Blocks.txt
	# pseudo-name, e.g. "Basic Latin".
	#
	# @param a hash ref from read_uchar.
	# @param a hash ref from read_Blocks.
	sub merge_Blocks {
	    my ($h, $blocks) = @_;

	    exists $h->{'blk'}
	        or die "Error: No blocks data in uchar.h";
	    my $blk = $h->{'blk'};

	    for my $enum (keys %$blk) {
	        my $cp = $blk->{$enum};
	        die "Error: No block found at $cp in Blocks.txt"
	            if ($cp && !exists $blocks->{$cp});
	        # Convert code point to pseudo-name, consuming the Blocks entry
	        $blk->{$enum} = delete $blocks->{$cp};
	    }

	    # Whatever is left over was never referenced from uchar.h
	    my $err = join('',
	                   map { "Error: Block " . $blocks->{$_} . " not listed in uchar.h\n" }
	                   keys %$blocks);
	    die $err if ($err);
	}
	569
	570	#----------------------------------------------------------------------
	# Merge property alias names into the uchar.h hash. The subhashes
	# under the top level keys listed in @TOP are examined and the values
	# of those subhashes are assumed to be long names in
	# PropertyAliases.txt. They are validated and replaced by
	# "<short>|<long>", followed by "|<alias>" when additional aliases
	# were recorded for the long name. Upon return, the
	# read_PropertyAliases hash is emptied of all contents, except for
	# those that failed to match. Unmatched names in PropertyAliases are
	# listed as a warning (or as info, for unsupported properties) but do
	# NOT cause the script to die; their short names are recorded in
	# %MISSING_FROM_UCHAR.
	#
	# @param a hash ref from read_uchar.
	# @param a hash ref from read_PropertyAliases.
	# @param a hash mapping long names to property family (e.g., 'binary')
	sub merge_PropertyAliases {
	    my ($h, $pa, $fam) = @_;

	    # All top level groups must be present before anything is modified
	    for my $type (@TOP) {
	        exists $h->{$type}
	            or die "Error: No properties data for $type in uchar.h";
	    }

	    for my $type (@TOP) {
	        my $props = $h->{$type};
	        for my $enum (keys %$props) {
	            my $long = $props->{$enum};
	            exists $pa->{$long}
	                or die "Error: Property $long not found (or used more than once)";

	            # Consume the alias entry so a second use is detected above
	            my $short = delete $pa->{$long};

	            # replace an "n/a" short name with an empty name (nothing before "|");
	            # don't remove the "|": there must always be a long name, and
	            # without the separator the long name would become the short name
	            $short = '' if ($short =~ m{^n/a\d*$});

	            my @names = ($short, $long);
	            push @names, $additional_property_aliases{$long}
	                if (exists $additional_property_aliases{$long});
	            $props->{$enum} = join("|", @names);
	        }
	    }

	    # Report everything in PropertyAliases that has no uchar.h enum
	    my @err;
	    for my $name (keys %$pa) {
	        $MISSING_FROM_UCHAR{$pa->{$name}} = 1;

	        my $level;
	        if (exists $UNSUPPORTED{$name}) {
	            $level = "Info";
	        }
	        elsif (!isIgnoredProperty($name)) {
	            $level = "Warning";
	        }
	        next unless (defined $level);
	        push @err, "$level: No enum for " . $fam->{$name} . " property $name in uchar.h";
	    }
	    print join("\n", sort @err), "\n" if (@err);
	}
	627
	628	#----------------------------------------------------------------------
	# Return 1 if two names match ignoring case, whitespace, '-', and '_'.
	# Used to match names in Blocks.txt with those in PropertyValueAliases.txt
	# as of Unicode 4.0.
	#
	# The names are compared as plain strings, so characters that are
	# special in regular expressions (e.g. '.' or '(') are matched
	# literally and can never cause a pattern compilation error.
	#
	# @param first name
	# @param second name
	#
	# @return true if the names are loosely equal, false otherwise
	sub matchesLoosely {
	    my ($left, $right) = @_;
	    # strip the ignorable characters from both (local copies only)
	    for my $name ($left, $right) {
	        $name =~ s/[\s\-_]//g;
	    }
	    return lc($left) eq lc($right);
	}
	638
	639	#----------------------------------------------------------------------
	# Merge PropertyValueAliases.txt data into the uchar.h hash. All
	# subhashes whose key does not start with '_' are analyzed and their
	# values mapped to the names listed in PropertyValueAliases. They are
	# then replaced with a string of the form "<short>|<long>". The short
	# or long name may be missing (empty) when it was listed as "n/a".
	#
	# Names are looked up exactly first, then case-insensitively against
	# both the short and the long name; for 'blk' a loose match (ignoring
	# whitespace, '-' and '_') is tried as a last resort. All comparisons
	# are plain string comparisons, so names are never interpreted as
	# regular expressions.
	#
	# The 'ccc' and 'binprop' value aliases are copied in wholesale
	# afterwards. Matched entries are removed from the value-alias hash
	# (except 'gc', which is shared with 'gcm'); whatever remains is
	# reported as a warning unless its property is in %MISSING_FROM_UCHAR.
	#
	# @param a hash ref from read_uchar.
	# @param a hash ref from read_PropertyValueAliases.
	sub merge_PropertyValueAliases {
	    my ($h, $va) = @_;

	    my %gcCount; # number of times each 'gc' value was used
	    for my $prop (keys %$h) {
	        # _bp, _ep etc. are handled in merge_PropertyAliases
	        next if ($prop =~ /^_/);

	        # Special case: gcm shares its values with gc
	        my $prop2 = ($prop eq 'gcm') ? 'gc' : $prop;

	        # find corresponding PropertyValueAliases data
	        die "Error: Can't find $prop in PropertyValueAliases.txt"
	            unless (exists $va->{$prop2});
	        my $pva = $va->{$prop2};

	        # match up data
	        my $hh = $h->{$prop};
	        for my $enum (keys %$hh) {

	            my $name = $hh->{$enum};

	            # look up both long and short & ignore case
	            my $n;
	            if (exists $pva->{$name}) {
	                $n = $name;
	            } else {
	                # iterate (slow)
	                my $wanted = lc($name);
	                for my $short (keys %$pva) {
	                    # case-insensitive match
	                    # & case-insensitive reverse match
	                    if (lc($short) eq $wanted ||
	                        lc($pva->{$short}) eq $wanted) {
	                        $n = $short;
	                        last;
	                    }
	                }
	            }

	            # For blocks, do a loose match from Blocks.txt pseudo-name
	            # to PropertyValueAliases long name.
	            if (!$n && $prop eq 'blk') {
	                for my $short (keys %$pva) {
	                    # The block is only going to match the long name,
	                    # but we check both for completeness.
	                    if (matchesLoosely($name, $pva->{$short}) ||
	                        matchesLoosely($name, $short)) {
	                        $n = $short;
	                        last;
	                    }
	                }
	            }

	            die "Error: Property value $prop:$name not found" unless ($n);

	            my $l = $n;
	            my $r = $pva->{$n};
	            # convert |n/a\d*| to blank
	            $l = '' if ($l =~ m{^n/a\d*$});
	            $r = '' if ($r =~ m{^n/a\d*$});

	            $hh->{$enum} = "$l|$r";
	            # Don't delete the 'gc' properties because we need to share
	            # them between 'gc' and 'gcm'. Count each use instead.
	            if ($prop2 eq 'gc') {
	                ++$gcCount{$n};
	            } else {
	                delete $pva->{$n};
	            }
	        }
	    }

	    # Merge the combining class values in manually
	    # Add the same values to the synthetic lccc and tccc properties
	    die "Error: No ccc data"
	        unless exists $va->{'ccc'};
	    for my $ccc (keys %{$va->{'ccc'}}) {
	        die "Error: Can't overwrite ccc $ccc"
	            if (exists $h->{'ccc'}->{$ccc});
	        my $alias = $va->{'ccc'}->{$ccc};
	        for my $target ('ccc', 'tccc', 'lccc') {
	            $h->{$target}->{$ccc} = $alias;
	        }
	    }
	    delete $va->{'ccc'};

	    # Merge synthetic binary property values in manually.
	    # These are the "True" and "False" value aliases.
	    die "Error: No True/False value aliases"
	        unless exists $va->{'binprop'};
	    for my $bp (keys %{$va->{'binprop'}}) {
	        $h->{'binprop'}->{$bp} = $va->{'binprop'}->{$bp};
	    }
	    delete $va->{'binprop'};

	    # Report value aliases that were never matched to a uchar.h enum
	    my $err = '';
	    for my $prop (sort keys %$va) {
	        my $hh = $va->{$prop};
	        for my $subkey (sort keys %$hh) {
	            # 'gc' props are shared with 'gcm'; make sure they were used
	            # once or twice. A value that was never used has no count
	            # at all, so default it to 0 before comparing numerically.
	            if ($prop eq 'gc') {
	                my $uses = $gcCount{$subkey} || 0;
	                next if ($uses >= 1 && $uses <= 2);
	            }
	            $err .= "Warning: Enum for value $prop:$subkey not found in uchar.h\n"
	                unless exists $MISSING_FROM_UCHAR{$prop};
	        }
	    }
	    print $err if ($err);
	}
	759
	760	#----------------------------------------------------------------------
	# Read the PropertyAliases.txt file (or a file in the same format) and
	# add its contents to the given hash, which maps the long name to the
	# short name. The special key '_version' will map to the Unicode
	# version found in the file. The special key '_family' holds a
	# subhash that maps long names to a family string (taken from the
	# "# <family> Properties" heading comments), for descriptive purposes.
	# A second alias on a line, if present, is recorded in
	# %additional_property_aliases under the long name.
	#
	# Data lines have the form "<short> ; <long> [; <alias>]"; whitespace
	# around the fields, including at the start of the line, is optional.
	# The script dies on unparsable lines, duplicate long names, or a
	# second version string.
	#
	# @param reference to hash to receive data. Keys are long names.
	# Values are short names.
	# @param a filename for PropertyAliases.txt
	sub read_PropertyAliases {

	    my ($hash, $filename) = @_;

	    # map long names to family string; continue an earlier call's map
	    my $fam = exists $hash->{'_family'} ? $hash->{'_family'} : {};

	    my $family; # binary, enumerated, etc.

	    my $in = FileHandle->new($filename, 'r');
	    die "Error: Cannot open $filename: $!" if (!defined $in);

	    while (my $line = <$in>) {

	        # Read version (embedded in a comment)
	        if ($line =~ /PropertyAliases-(\d+\.\d+\.\d+)/i) {
	            die "Error: Multiple versions in $filename"
	                if (exists $hash->{'_version'});
	            $hash->{'_version'} = $1;
	        }

	        # Read family heading (also a comment, so check before stripping)
	        if ($line =~ /^\s*\#\s*(.+?)\s*Properties\s*$/) {
	            $family = $1;
	        }

	        # Ignore comments and blank lines
	        $line =~ s/\#.*//;
	        next unless ($line =~ /\S/);

	        if ($line !~ /^\s*(.+?)\s*;/) {
	            die "Error: Can't parse $line in $filename";
	        }
	        my $short = $1;

	        my @fields = ($line =~ /;\s*([^\s;]+)/g);
	        if (@fields < 1 || @fields > 2) {
	            my $number = @fields;
	            die "Error: Wrong number of fields ($number) in $filename at $line";
	        }
	        my ($long, @extra) = @fields;

	        # Make "n/a" strings unique
	        if ($short eq 'n/a') {
	            $short .= sprintf("%03d", $propNA++);
	        }
	        if ($long eq 'n/a') {
	            $long .= sprintf("%03d", $propNA++);
	        }

	        # Add long name->short name to the hash=pa hash table
	        if (exists $hash->{$long}) {
	            die "Error: Duplicate property $long in $filename"
	        }
	        $hash->{$long} = $short;
	        $fam->{$long} = $family;

	        # Add the list of further aliases to the
	        # additional_property_aliases hash table, using the long
	        # property name as the key, aliases joined with '|' (last
	        # alias first). For example:
	        # White_Space->space|outer_space
	        if (@extra) {
	            $additional_property_aliases{$long} = join("|", reverse @extra);
	        }
	    }

	    $in->close();

	    $hash->{'_family'} = $fam;
	}
	846
	847	#----------------------------------------------------------------------
	# Read the PropertyValueAliases.txt file (or a file in the same
	# format) and add its contents to the given hash via addDatum(),
	# passing the property short name, the first value name, and the
	# remaining value name(s) joined with '|'. The special key '_version'
	# will map to the Unicode version found in the file.
	#
	# Data lines have the form "<prop> ; <name> ; <name> [; <name>]";
	# whitespace around the fields, including at the start of the line,
	# is optional. The script dies on unparsable lines, a wrong number of
	# fields, or a second version string.
	#
	# After reading, a Qaac->Coptic script entry is added if neither Qaac
	# nor Copt is present, and the synthetic 'binprop' values F|False and
	# T|True are (re)set.
	#
	# @param reference to hash to receive data
	# @param a filename for PropertyValueAliases.txt
	sub read_PropertyValueAliases {

	    my ($hash, $filename) = @_;

	    my $in = FileHandle->new($filename, 'r');
	    die "Error: Cannot open $filename: $!" if (!defined $in);

	    # NOTE(review): $_ is kept as the line variable because addDatum()
	    # is defined elsewhere and is called while $_ holds the line.
	    while (<$in>) {

	        # Read version (embedded in a comment)
	        if (/PropertyValueAliases-(\d+\.\d+\.\d+)/i) {
	            die "Error: Multiple versions in $filename"
	                if (exists $hash->{'_version'});
	            $hash->{'_version'} = $1;
	        }

	        # Ignore comments and blank lines
	        s/\#.*//;
	        next unless (/\S/);

	        if (!/^\s*(.+?)\s*;/i) {
	            die "Error: Can't parse $_ in $filename";
	        }
	        my $prop = $1;

	        my @fields = /;\s*([^\s;]+)/g;
	        die "Error: Wrong number of fields in $filename"
	            if (@fields < 2 || @fields > 3);

	        my ($first, @rest) = @fields;
	        # Make "n/a" strings unique
	        $first .= sprintf("%03d", $valueNA++) if ($first eq 'n/a');

	        # Squash extra fields together into one '|'-separated string
	        addDatum($hash, $prop, $first, join('|', @rest));
	    }

	    $in->close();

	    # Script Copt=Qaac (Coptic) is a special case.
	    # Before the Copt code was defined, the private-use code Qaac was used.
	    # Starting with Unicode 4.1, PropertyValueAliases.txt contains
	    # Copt as the short name as well as Qaac as an alias.
	    # For use with older Unicode data files, we add here a Qaac->Coptic entry.
	    # This should not do anything for 4.1-and-later Unicode data files.
	    # See also UAX #24: Script Names http://www.unicode.org/unicode/reports/tr24/
	    $hash->{'sc'}->{'Qaac'} = 'Coptic'
	        unless (exists $hash->{'sc'}->{'Qaac'} || exists $hash->{'sc'}->{'Copt'});

	    # Add T|True and F|False -- these are values we recognize for
	    # binary properties (NOT from PropertyValueAliases.txt). These
	    # are of the same form as the 'ccc' value aliases.
	    $hash->{'binprop'}->{'0'} = 'F|False';
	    $hash->{'binprop'}->{'1'} = 'T|True';
	}
	917
	918	#----------------------------------------------------------------------
	# Read the Blocks.txt file. Return a hash that maps the code point
	# range start to the block name. The special key '_version' will map
	# to the Unicode version of the file.
	#
	# As of Unicode 4.0, the names in the Blocks.txt are no longer the
	# proper names. The proper names are now listed in PropertyValueAliases.
	# They are similar but not identical. Furthermore, 4.0 introduces
	# a new block name, No_Block, which is listed only in PropertyValueAliases
	# and not in Blocks.txt. As a result, we handle blocks as follows:
	#
	# 1. Read Blocks.txt to map code point range start to quasi-block name.
	# 2. Add a synthetic entry for No Block under the key 'none':
	#    none -> No Block
	# 3. Map quasi-names from Blocks.txt (including No Block) to actual
	#    names from PropertyValueAliases. This occurs in
	#    merge_PropertyValueAliases.
	#
	# Data lines have the form "<start>..<end>; <name>"; whitespace on
	# either side of the ';' is optional. The script dies on unparsable
	# lines, duplicate range starts, or a second version string.
	#
	# @param a filename for Blocks.txt
	#
	# @return a ref to a hash. Keys are code points, as text, e.g.,
	# "1720". Values are pseudo-block names, e.g., "Hanunoo".
	sub read_Blocks {

	    my ($filename) = @_;

	    my $hash = {}; # result

	    my $in = FileHandle->new($filename, 'r');
	    die "Error: Cannot open $filename: $!" if (!defined $in);

	    while (my $line = <$in>) {

	        # Read version (embedded in a comment)
	        if ($line =~ /Blocks-(\d+\.\d+\.\d+)/i) {
	            die "Error: Multiple versions in $filename"
	                if (exists $hash->{'_version'});
	            $hash->{'_version'} = $1;
	        }

	        # Ignore comments and blank lines
	        $line =~ s/\#.*//;
	        next unless ($line =~ /\S/);

	        if ($line !~ /^([0-9a-f]+)\.\.[0-9a-f]+\s*;\s*(.+?)\s*$/i) {
	            die "Error: Can't parse $line in $filename";
	        }
	        my ($start, $name) = ($1, $2);

	        die "Error: Duplicate range $start in $filename"
	            if (exists $hash->{$start});
	        $hash->{$start} = $name;
	    }

	    $in->close();

	    # Add pseudo-name for No Block
	    $hash->{'none'} = 'No Block';

	    return $hash;
	}
	980
	#----------------------------------------------------------------------
	# Read the uscript.h file and compile a mapping of icu4c script enum
	# symbols to the script tag given in each symbol's trailing comment.
	#
	# Only constants inside "typedef enum UScriptCode" are collected, and
	# only those whose line carries a /* ... */ comment; the first word of
	# that comment becomes the value.  Text between
	# "#ifdef ICU_UCHAR_USE_DEPRECATES" and the matching "#endif" is
	# skipped.  Lines ending in a backslash are joined with the lines
	# that follow before being examined.
	#
	# @param a filename for uscript.h
	#
	# @return a ref to a hash.  The keys of the hash are enum symbols from
	# uscript.h (USCRIPT_xxx), and the values are the script names taken
	# from the comments.
	#
	# Dies if the file cannot be opened, if an enum symbol is seen twice,
	# or if an #ifdef appears inside the deprecated section.
	sub read_uscript {
		my $filename = shift;

		my $mode = '';  # state machine mode: enum name, 'DEPRECATED', or ''
		my $last = '';  # text accumulated from backslash-continued lines
		my $hash = {};  # result

		my $in = FileHandle->new($filename, 'r');
		die "Error: Cannot open $filename" if (!defined $in);

		while (<$in>) {
			# Fold continued lines together.  Append rather than assign so
			# that a run of several continuation lines is kept whole.
			if (/^(.*)\\$/) {
				$last .= $1;
				next;
			} elsif ($last) {
				$_ = $last . $_;
				$last = '';
			}

			# Exit all modes here: a closing brace ends the current enum
			if ($mode && $mode ne 'DEPRECATED') {
				if (/^\s*\}/) {
					$mode = '';
					next;
				}
			}

			# Handle individual modes

			if ($mode eq 'UScriptCode') {
				# e.g. "USCRIPT_COMMON = 0, /* Zyyy */"
				if (m|^\s*(USCRIPT_\w+).+?/\*\s*(\w+)|) {
					my ($enum, $code) = ($1, $2);
					die "Error: Duplicate script $enum"
						if (exists $hash->{$enum});
					$hash->{$enum} = $code;
				}
			}

			elsif ($mode eq 'DEPRECATED') {
				if (/\s*\#ifdef/) {
					die "Error: Nested #ifdef";
				}
				elsif (/\s*\#endif/) {
					$mode = '';
				}
			}

			elsif (!$mode) {
				# The enum name may be followed by '{' on the same line
				# or stand at the end of the line.
				if (/^\s*typedef\s+enum\s+(\w+)\s*\{/ ||
					/^\s*typedef\s+enum\s+(\w+)\s*$/) {
					$mode = $1;
				}

				elsif (/^\s*\#ifdef\s+ICU_UCHAR_USE_DEPRECATES\b/) {
					$mode = 'DEPRECATED';
				}
			}
		}

		$in->close();

		return $hash;
	}
	1059
	#----------------------------------------------------------------------
	# Read the uchar.h file and compile a mapping of icu4c enum symbols to
	# Unicode property and property value aliases.
	#
	# The file is scanned line by line with a small state machine: a
	# "typedef enum NAME" or "enum NAME" line enters mode NAME, a line
	# starting with '}' leaves it, and text between
	# "#ifdef ICU_UCHAR_USE_DEPRECATES" and "#endif" is skipped.  Lines
	# ending in a backslash are joined with the lines that follow.
	#
	# @param a filename for uchar.h
	#
	# @return a ref to a hash.  The first-level keys are:
	#   - the property-type keys looked up in %PROP_TYPE (defined elsewhere
	#     in this file; per the original notes '_bp' for binary
	#     properties, '_ep' for enumerated properties, and
	#     '_dp'/'_sp'/'_mp' for double/string/mask properties);
	#   - 'gc', 'gcm', 'bc', 'blk', 'ea', 'dt', 'jt', 'jg', 'GCB', 'WB',
	#     'SB', 'lb', 'nt' and 'hst' for the corresponding property value
	#     aliases;
	#   - 'NFC_QC', 'NFKC_QC', 'NFD_QC' and 'NFKD_QC', which are hardcoded
	#     here rather than read from the file.
	# The values of the hash are subhashes.  The subhashes have a key of
	# the enum symbol and a value of the alias string that uchar.h lists
	# for it.  This may be either the short or the long alias, depending
	# on the specific enum.  NOTE: For blocks ('blk'), the value is
	# whatever appears in the /*[...]*/ comment -- per the original notes,
	# a hex code point for the start of the associated block.
	# NOTE: The special key _version maps to the value of the
	# U_UNICODE_VERSION #define, with surrounding double quotes removed.
	#
	# Dies if the file cannot be opened, if U_UNICODE_VERSION is defined
	# twice, if a property comment names a type missing from %PROP_TYPE,
	# if a tag comment is followed by another tag comment instead of an
	# enum constant, if an #ifdef appears inside the deprecated section,
	# or (via addDatum) if the same symbol is recorded twice.
	sub read_uchar {
		my $filename = shift;

		my $mode = '';     # state machine mode: enum name, 'DEPRECATED', or ''
		my $submode = '';  # alias tag seen in a comment, awaiting its constant
		my $last = '';     # text accumulated from backslash-continued lines
		my $hash = {};     # result
		my $key;           # first-level key for the pending $submode

		# Enums whose constants carry their alias in a trailing /*[alias]*/
		# comment on the same line:
		#   enum name => [ constant prefix, first-level key ]
		my %bracketed = (
			'UBlockCode'            => [ 'UBLOCK_', 'blk' ],
			'UEastAsianWidth'       => [ 'U_EA_',   'ea'  ],
			'UDecompositionType'    => [ 'U_DT_',   'dt'  ],
			'UJoiningType'          => [ 'U_JT_',   'jt'  ],
			'UGraphemeClusterBreak' => [ 'U_GCB_',  'GCB' ],
			'UWordBreakValues'      => [ 'U_WB_',   'WB'  ],
			'USentenceBreak'        => [ 'U_SB_',   'SB'  ],
			'ULineBreak'            => [ 'U_LB_',   'lb'  ],
			'UNumericType'          => [ 'U_NT_',   'nt'  ],
			'UHangulSyllableType'   => [ 'U_HST_',  'hst' ],
		);

		my $in = FileHandle->new($filename, 'r');
		die "Error: Cannot open $filename" if (!defined $in);

		while (<$in>) {
			# Fold continued lines together
			if (/^(.*)\\$/) {
				$last .= $1;
				next;
			} elsif ($last) {
				$_ = $last . $_;
				$last = '';
			}

			# Exit all modes here: a closing brace ends the current enum
			if ($mode && $mode ne 'DEPRECATED') {
				if (/^\s*\}/) {
					$mode = '';
					next;
				}
			}

			# Handle individual modes

			if ($mode eq 'UProperty') {
				# An enum constant.  It is recorded only if a tag comment
				# preceded it; untagged constants are ignored.
				if (/^\s*(UCHAR_\w+)\s*[,=]/ || /^\s+(UCHAR_\w+)\s*$/) {
					if ($submode) {
						addDatum($hash, $key, $1, $submode);
						$submode = '';
					}
				}

				# A tag comment: "/** <type> property <alias>"
				elsif (m|^\s*/\*\*\s*(\w+)\s+property\s+(\w+)|i) {
					die "Error: Unmatched tag $submode" if ($submode);
					die "Error: Unrecognized UProperty comment: $_"
						unless (exists $PROP_TYPE{$1});
					$key = $PROP_TYPE{$1};
					$submode = $2;
				}
			}

			elsif ($mode eq 'UCharCategory') {
				if (/^\s*(U_\w+)\s*=/) {
					if ($submode) {
						addDatum($hash, 'gc', $1, $submode);
						$submode = '';
					}
				}

				# A tag comment opening with a two-letter category, "/** Lu "
				elsif (m|^\s*/\*\*\s*([A-Z][a-z])\s|) {
					die "Error: Unmatched tag $submode" if ($submode);
					$submode = $1;
				}
			}

			elsif ($mode eq 'UCharDirection') {
				if (/^\s*(U_\w+)\s*[,=]/ || /^\s+(U_\w+)\s*$/) {
					if ($submode) {
						addDatum($hash, $key, $1, $submode);
						$submode = '';
					}
				}

				# A tag comment opening with an upper-case alias, "/** AL "
				elsif (m|/\*\*\s*([A-Z]+)\s|) {
					die "Error: Unmatched tag $submode" if ($submode);
					$key = 'bc';
					$submode = $1;
				}
			}

			elsif (exists $bracketed{$mode}) {
				# e.g. "U_EA_NEUTRAL, /*[N]*/"
				my ($prefix, $k1) = @{ $bracketed{$mode} };
				if (m|^\s*(\Q$prefix\E\w+).+?/\*\[(.+?)\]\*/|) {
					addDatum($hash, $k1, $1, $2);
				}
			}

			elsif ($mode eq 'UJoiningGroup') {
				# The alias is the constant name itself minus the U_JG_
				# prefix; the U_JG_COUNT limit constant is not a value.
				if (/^\s*(U_JG_(\w+))/) {
					addDatum($hash, 'jg', $1, $2) unless ($2 eq 'COUNT');
				}
			}

			elsif ($mode eq 'DEPRECATED') {
				if (/\s*\#ifdef/) {
					die "Error: Nested #ifdef";
				}
				elsif (/\s*\#endif/) {
					$mode = '';
				}
			}

			elsif (!$mode) {
				if (/^\s*\#define\s+(\w+)\s+(.+)/) {
					# #define $left $right
					my ($left, $right) = ($1, $2);

					if ($left eq 'U_UNICODE_VERSION') {
						my $version = $right;
						$version = $1 if ($version =~ /^\"(.*)\"/);
						die "Error: Multiple versions in $filename"
							if (defined $hash->{'_version'});
						$hash->{'_version'} = $version;
					}

					# General_Category_Mask pseudo-property values
					elsif ($left =~ /U_GC_(\w+?)_MASK/) {
						addDatum($hash, 'gcm', $left, $1);
					}
				}

				elsif (/^\s*typedef\s+enum\s+(\w+)\s*\{/ ||
					   /^\s*typedef\s+enum\s+(\w+)\s*$/) {
					$mode = $1;
				}

				elsif (/^\s*enum\s+(\w+)\s*\{/ ||
					   /^\s*enum\s+(\w+)\s*$/) {
					$mode = $1;
				}

				elsif (/^\s*\#ifdef\s+ICU_UCHAR_USE_DEPRECATES\b/) {
					$mode = 'DEPRECATED';
				}
			}
		}

		$in->close();

		# hardcode known values for the normalization quick check properties
		# see unorm.h for the UNormalizationCheckResult enum

		addDatum($hash, 'NFC_QC', 'UNORM_NO', 'N');
		addDatum($hash, 'NFC_QC', 'UNORM_YES', 'Y');
		addDatum($hash, 'NFC_QC', 'UNORM_MAYBE', 'M');

		addDatum($hash, 'NFKC_QC', 'UNORM_NO', 'N');
		addDatum($hash, 'NFKC_QC', 'UNORM_YES', 'Y');
		addDatum($hash, 'NFKC_QC', 'UNORM_MAYBE', 'M');

		# no "maybe" values for NF[K]D

		addDatum($hash, 'NFD_QC', 'UNORM_NO', 'N');
		addDatum($hash, 'NFD_QC', 'UNORM_YES', 'Y');

		addDatum($hash, 'NFKD_QC', 'UNORM_NO', 'N');
		addDatum($hash, 'NFKD_QC', 'UNORM_YES', 'Y');

		return $hash;
	}
	1300
	#----------------------------------------------------------------------
	# Store a value in a two-level hash, refusing to overwrite.  Given a
	# ref to a hash, a first-level key, a second-level key, and a value,
	# this sets $hash->{$key1}->{$key2} = $value.  If that slot already
	# exists, it dies with a message naming both keys, the value already
	# stored, and the value that was rejected.
	sub addDatum {
		my ($table, $k1, $k2, $value) = @_;

		# Guard first: an existing entry is never replaced
		die "Error: $k1:$k2 already set to " .
			$table->{$k1}->{$k2} . ", cannot set to " . $value
			if (exists $table->{$k1}->{$k2});

		return $table->{$k1}->{$k2} = $value;
	}
	1312
	1313	#eof