#!/bin/perl -w
#*******************************************************************
# COPYRIGHT:
-# Copyright (c) 2002-2003, International Business Machines Corporation and
+# Copyright (c) 2002-2006, International Business Machines Corporation and
# others. All Rights Reserved.
#*******************************************************************
use strict;
use Dumpvalue;
-my $DEBUG = 0;
+my $DEBUG = 1;
my $DUMPER = new Dumpvalue;
my $count = @ARGV;
# Get the current year from the system
my $YEAR = 1900+@{[localtime]}[5]; # Get the current year
+# Used to make "n/a" property [value] aliases (Unicode or Synthetic) unique
+my $propNA = 0;
+my $valueNA = 0;
+
#----------------------------------------------------------------------
# Top level property keys for binary, enumerated, string, and double props
my @TOP = qw( _bp _ep _sp _dp _mp );
Expands_On_NFKD => 1,
FC_NFKC_Closure => 1,
ID_Start_Exceptions => 1,
- NFC_Quick_Check => 1,
- NFD_Quick_Check => 1,
- NFKC_Quick_Check => 1,
- NFKD_Quick_Check => 1,
Special_Case_Condition => 1,
);
# missing.
my %MISSING_FROM_UCHAR;
+# Additional property aliases beyond short and long names,
+# like space in addition to WSpace and White_Space in Unicode 4.1.
+# Hashtable, maps long name to alias.
+# For example, maps White_Space->space.
+#
+# If multiple additional aliases are defined,
+# then they are separated in the value string with '|'.
+# For example, White_Space->space|outer_space
+my %additional_property_aliases;
+
#----------------------------------------------------------------------
# Emitted class names
$i = $groupToInt{$groupString};
} else {
my @names = split(/\|/, $groupString);
- die "Error: Wrong number of names in " . $groupString if (@names != 2);
+ die "Error: Wrong number of names in " . $groupString if (@names < 1);
$i = @nameGroups; # index of group we are making
$groupToInt{$groupString} = $i; # Cache for reuse
push @nameGroups, map { $stringToID{$_} } @names;
print "int32_t NAME_GROUP[] = {\n";
# emit one group per line, with annotations
+ my $max_names = 0;
for (my $i=0; $i<@nameGroups; ) {
my @a;
my $line;
' 'x(20-length($line)),
"/* ", sprintf("%3d", $start),
": \"", join("\", \"", map { $strings[$_] } @a), "\" */\n";
+ $max_names = @a if(@a > $max_names);
+
}
print "};\n\n";
# The maximum number of names per group is computed dynamically while the
# groups are emitted above (it used to be fixed at "2" for 3.2).
- print "#define MAX_NAMES_PER_GROUP 2\n\n";
+ print "#define MAX_NAMES_PER_GROUP $max_names\n\n";
#------------------------------------------------------------
# Emit enumerated property values
my $pa = {};
read_PropertyAliases($pa, "$unidataDir/PropertyAliases.txt");
read_PropertyAliases($pa, "SyntheticPropertyAliases.txt");
- my $va = read_PropertyValueAliases("$unidataDir/PropertyValueAliases.txt");
+ my $va = {};
+ read_PropertyValueAliases($va, "$unidataDir/PropertyValueAliases.txt");
+ read_PropertyValueAliases($va, "SyntheticPropertyValueAliases.txt");
# Extract property family hash
my $fam = $pa->{'_family'};
for my $subh (map { $h->{$_} } @TOP) {
for my $enum (keys %$subh) {
- my $name = $subh->{$enum};
- die "Error: Property $name not found (or used more than once)"
- unless (exists $pa->{$name});
+ my $long_name = $subh->{$enum};
+ if (!exists $pa->{$long_name}) {
+ die "Error: Property $long_name not found (or used more than once)";
+ }
- $subh->{$enum} = $pa->{$name} . "|" . $name;
- delete $pa->{$name};
+ my $value;
+ if($pa->{$long_name} =~ m|^n/a\d*$|) {
+ # replace an "n/a" short name with an empty name (nothing before "|");
+ # don't remove it (don't remove the "|"): there must always be a long name,
+ # and if the short name is removed, then the long name becomes the
+ # short name and there is no long name left (unless there is another alias)
+ $value = "|" . $long_name;
+ } else {
+ $value = $pa->{$long_name} . "|" . $long_name;
+ }
+ if (exists $additional_property_aliases{$long_name}) {
+ $value .= "|" . $additional_property_aliases{$long_name};
+ }
+ $subh->{$enum} = $value;
+ delete $pa->{$long_name};
}
}
+
my @err;
for my $name (keys %$pa) {
$MISSING_FROM_UCHAR{$pa->{$name}} = 1;
# look up both long and short & ignore case
my $n;
if (exists $pva->{$name}) {
- $n = $name;
+ $n = $name;
} else {
# iterate (slow)
for my $a (keys %$pva) {
}
}
}
-
+
# For blocks, do a loose match from Blocks.txt pseudo-name
# to PropertyValueAliases long name.
if (!$n && $prop eq 'blk') {
my $l = $n;
my $r = $pva->{$n};
- # convert |n/a\d+| to blank
- $l = '' if ($l =~ m|^n/a\d+$|);
- $r = '' if ($r =~ m|^n/a\d+$|);
+ # convert |n/a\d*| to blank
+ $l = '' if ($l =~ m|^n/a\d*$|);
+ $r = '' if ($r =~ m|^n/a\d*$|);
$hh->{$enum} = "$l|$r";
# Don't delete the 'gc' properties because we need to share
}
# Merge the combining class values in manually
+ # Add the same values to the synthetic lccc and tccc properties
die "Error: No ccc data"
unless exists $va->{'ccc'};
for my $ccc (keys %{$va->{'ccc'}}) {
die "Error: Can't overwrite ccc $ccc"
if (exists $h->{'ccc'}->{$ccc});
+ $h->{'lccc'}->{$ccc} =
+ $h->{'tccc'}->{$ccc} =
$h->{'ccc'}->{$ccc} = $va->{'ccc'}->{$ccc};
}
delete $va->{'ccc'};
s/\#.*//;
next unless (/\S/);
- if (/^\s*(.+?)\s*;\s*(.+?)\s*$/i) {
- die "Error: Duplicate property $1 in $filename"
- if (exists $hash->{$2});
- $hash->{$2} = $1;
- $fam->{$2} = $family;
- }
+ if (/^\s*(.+?)\s*;/) {
+ my $short = $1;
+ my @fields = /;\s*([^\s;]+)/g;
+ if (@fields < 1 || @fields > 2) {
+ my $number = @fields;
+ die "Error: Wrong number of fields ($number) in $filename at $_";
+ }
- else {
+ # Make "n/a" strings unique
+ if ($short eq 'n/a') {
+ $short .= sprintf("%03d", $propNA++);
+ }
+ my $long = $fields[0];
+ if ($long eq 'n/a') {
+ $long .= sprintf("%03d", $propNA++);
+ }
+
+ # Add long name->short name to the result hash (the caller's $pa)
+ if (exists $hash->{$long}) {
+ die "Error: Duplicate property $long in $filename"
+ }
+ $hash->{$long} = $short;
+ $fam->{$long} = $family;
+
+ # Add the list of further aliases to the additional_property_aliases hash table,
+ # using the long property name as the key.
+ # For example:
+ # White_Space->space|outer_space
+ if (@fields > 1) {
+ my $value = pop @fields;
+ while (@fields > 1) {
+ $value .= "|" . pop @fields;
+ }
+ $additional_property_aliases{$long} = $value;
+ }
+ } else {
die "Error: Can't parse $_ in $filename";
}
}
# Fills the hash reference passed as the first argument (the hash is no
# longer returned explicitly).
sub read_PropertyValueAliases {
- my $filename = shift;
+ my $hash = shift; # result
- my $hash = {}; # result
+ my $filename = shift;
my $in = new FileHandle($filename, 'r');
die "Error: Cannot open $filename" if (!defined $in);
- my $sym = 0; # Used to make "n/a" strings unique
-
while (<$in>) {
# Read version (embedded in a comment)
if (/^\s*(.+?)\s*;/i) {
my $prop = $1;
my @fields = /;\s*([^\s;]+)/g;
- die "Error: Wrong number of fields"
+ die "Error: Wrong number of fields in $filename"
if (@fields < 2 || @fields > 3);
# Make "n/a" strings unique
- $fields[0] .= sprintf("%03d", $sym++) if ($fields[0] eq 'n/a');
+ $fields[0] .= sprintf("%03d", $valueNA++) if ($fields[0] eq 'n/a');
# Squash extra fields together
while (@fields > 2) {
my $f = pop @fields;
$in->close();
- # Script Qaac (Coptic) is a special case. Handle it here. See UTR#24:
- # http://www.unicode.org/unicode/reports/tr24/
+ # Script Copt=Qaac (Coptic) is a special case.
+ # Before the Copt code was defined, the private-use code Qaac was used.
+ # Starting with Unicode 4.1, PropertyValueAliases.txt contains
+ # Copt as the short name as well as Qaac as an alias.
+ # For use with older Unicode data files, we add here a Qaac->Coptic entry.
+ # This should not do anything for 4.1-and-later Unicode data files.
+ # See also UAX #24: Script Names http://www.unicode.org/unicode/reports/tr24/
$hash->{'sc'}->{'Qaac'} = 'Coptic'
- unless (exists $hash->{'sc'}->{'Qaac'});
+ unless (exists $hash->{'sc'}->{'Qaac'} || exists $hash->{'sc'}->{'Copt'});
# Add T|True and F|False -- these are values we recognize for
# binary properties (NOT from PropertyValueAliases.txt). These
# are of the same form as the 'ccc' value aliases.
$hash->{'binprop'}->{'0'} = 'F|False';
$hash->{'binprop'}->{'1'} = 'T|True';
-
- $hash;
}
#----------------------------------------------------------------------
s/\#.*//;
next unless (/\S/);
- if (/^([0-9a-f]+)\.\.[0-9a-f]+;\s*(.+?)\s*$/i) {
+ if (/^([0-9a-f]+)\.\.[0-9a-f]+\s*;\s*(.+?)\s*$/i) {
die "Error: Duplicate range $1 in $filename"
if (exists $hash->{$1});
$hash->{$1} = $2;
}
}
+ elsif ($mode eq 'UGraphemeClusterBreak') {
+ if (m|^\s*(U_GCB_\w+).+?/\*\[(.+?)\]\*/|) {
+ addDatum($hash, 'GCB', $1, $2);
+ }
+ }
+
+ elsif ($mode eq 'UWordBreakValues') {
+ if (m|^\s*(U_WB_\w+).+?/\*\[(.+?)\]\*/|) {
+ addDatum($hash, 'WB', $1, $2);
+ }
+ }
+
+ elsif ($mode eq 'USentenceBreak') {
+ if (m|^\s*(U_SB_\w+).+?/\*\[(.+?)\]\*/|) {
+ addDatum($hash, 'SB', $1, $2);
+ }
+ }
+
elsif ($mode eq 'ULineBreak') {
if (m|^\s*(U_LB_\w+).+?/\*\[(.+?)\]\*/|) {
addDatum($hash, 'lb', $1, $2);
$in->close();
+ # hardcode known values for the normalization quick check properties
+ # see unorm.h for the UNormalizationCheckResult enum
+
+ addDatum($hash, 'NFC_QC', 'UNORM_NO', 'N');
+ addDatum($hash, 'NFC_QC', 'UNORM_YES', 'Y');
+ addDatum($hash, 'NFC_QC', 'UNORM_MAYBE', 'M');
+
+ addDatum($hash, 'NFKC_QC', 'UNORM_NO', 'N');
+ addDatum($hash, 'NFKC_QC', 'UNORM_YES', 'Y');
+ addDatum($hash, 'NFKC_QC', 'UNORM_MAYBE', 'M');
+
+ # no "maybe" values for NF[K]D
+
+ addDatum($hash, 'NFD_QC', 'UNORM_NO', 'N');
+ addDatum($hash, 'NFD_QC', 'UNORM_YES', 'Y');
+
+ addDatum($hash, 'NFKD_QC', 'UNORM_NO', 'N');
+ addDatum($hash, 'NFKD_QC', 'UNORM_YES', 'Y');
+
$hash;
}