#**************************************************************************
-# Copyright (C) 2002-2004 International Business Machines Corporation *
-# and others. All rights reserved. *
+# Copyright (C) 2016 and later: Unicode, Inc. and others.
+# License & terms of use: http://www.unicode.org/copyright.html#License
+#**************************************************************************
+#**************************************************************************
+# Copyright (C) 2002-2016 International Business Machines Corporation
+# and others. All rights reserved.
#**************************************************************************
#
# rbbicst Compile the RBBI rule parser state table data into initialized C data.
# Usage:
# cd icu/source/common
-# perl rbbicst.pl [-j] < rbbirpt.txt > rbbirpt.h
+# perl rbbicst.pl < rbbirpt.txt > rbbirpt.h
+# perl rbbicst.pl -j < rbbirpt.txt > RBBIRuleParseTable.java
#
# The output file, rbbirpt.h, is included by some of the .cpp rbbi
# implementation files. This perl script is NOT run as part
# of a normal ICU build. It is run by hand when needed, and the
# rbbirpt.h generated file is put back into cvs.
#
-# See rbbirpt.h for a description of the input format for this script.
+# See rbbirpt.txt for a description of the input format for this script.
#
if ($ARGV[0] eq "-j") {
#
# do the 'n' flag
#
- $state_flag[$num_states] = "FALSE";
+ $state_flag[$num_states] = $javaOutput? "false" : "FALSE";
if ($fields[0] eq "n") {
- $state_flag[$num_states] = "TRUE";
+ $state_flag[$num_states] = $javaOutput? "true": "TRUE";
shift @fields;
}
die if ($errors>0);
+#
+# Assign numbers to each of the character classes used.
+# Sets are numbered from 128 - 250
+# The values 0-127 in the state table are used for matching
+# individual ASCII characters (the only thing that can appear in the rules.)
+# The "set" names appearing in the code below (default, etc.) need special
+# handling because they do not correspond to a normal set of characters,
+# but trigger special handling by code in the state machine.
+#
+$i = 128;
+foreach $setName (sort keys %charClasses) {
+ if ($setName eq "default") {
+ $charClasses{$setName} = 255;}
+ elsif ($setName eq "escaped") {
+ $charClasses{$setName} = 254;}
+ elsif ($setName eq "escapedP") {
+ $charClasses{$setName} = 253;}
+ elsif ($setName eq "eof") {
+ $charClasses{$setName} = 252;}
+ else {
+ # Normal (single) character class. Number them.
+ $charClasses{$setName} = $i;
+ $i++;
+ }
+}
+
+
my ($sec, $min, $hour, , $day, $mon, $year, $wday, $yday, $isdst) = localtime;
$year += 1900;
print " * rule parser.\n";
print " * It is generated by the Perl script \"rbbicst.pl\" from\n";
print " * the rule parser state definitions file \"rbbirpt.txt\".\n";
+ print " * \@internal \n";
print " *\n";
print " */\n";
- print "public class RuleBasedBreakIteratorStateTable\n";
+ print "class RBBIRuleParseTable\n";
print "{\n";
- #
+ #
# Emit the constants for the actions to be performed.
#
$n = 1;
- foreach $act (keys %actions) {
- print " public static final int $act = $n;\n";
+ foreach $act (sort keys %actions) {
+ print " static final short $act = $n;\n";
$n++;
}
print " \n";
+
#
- # emit the state transition table
+ # Emit constants for char class names
#
- print " public static final String[] gRuleParseStateTable = {\n";
- printf(" \"\\u%04.4x\\u%04.4x\\u%04.4x\\u%04.4x\\u%04.4x\"\n", doNOP, 0, 0, 0, 1);
+ foreach $setName (sort keys %charClasses) {
+ print " static final short kRuleSet_$setName = $charClasses{$setName};\n";
+ }
+ print "\n\n";
+
+
+ print " static class RBBIRuleTableElement { \n";
+ print " short fAction; \n";
+ print " short fCharClass; \n";
+ print " short fNextState; \n";
+ print " short fPushState; \n";
+ print " boolean fNextChar; \n";
+ print " String fStateName; \n";
+ print " RBBIRuleTableElement(short a, int cc, int ns, int ps, boolean nc, String sn) { \n";
+ print " fAction = a; \n";
+ print " fCharClass = (short)cc; \n";
+ print " fNextState = (short)ns; \n";
+ print " fPushState = (short)ps; \n";
+ print " fNextChar = nc; \n";
+ print " fStateName = sn; \n";
+ print " } \n";
+ print " }; \n";
+ print " \n";
+
+
+ print " static RBBIRuleTableElement[] gRuleParseStateTable = { \n ";
+ print " new RBBIRuleTableElement(doNOP, 0, 0,0, true, null ) // 0 \n"; #output the unused state 0.
for ($state=1; $state < $num_states; $state++) {
- printf(" , \"\\u%04.4x", $state_func_name[$state]);
- # print " , {$state_func_name[$state],";
+ print " , new RBBIRuleTableElement($state_func_name[$state],";
if ($state_literal_chars[$state] ne "") {
- printf("\\u%04.4x", $state_func_name[$state]);
+ $c = $state_literal_chars[$state];
+ print("'$c', ");
}else {
- printf("\\u%04.4x", $charClasses{$state_char_class[$state]});
+ print " $charClasses{$state_char_class[$state]},";
}
- printf("\\u%04.4x", $states{$state_dest_state[$state]});
-
+ print " $states{$state_dest_state[$state]},";
+
# The push-state field is optional. If omitted, fill field with a zero, which flags
# the state machine that there is no push state.
if ($state_push_state[$state] eq "") {
- print "\\u0000";
+ print "0, ";
} else {
- printf("\\u%04.4x", $states{$state_push_state[$state]});
+ print " $states{$state_push_state[$state]},";
}
- printf("\\u%04.4x", $state_flag[$state]);
-
- # For the first row of each state, append the state name.
- # Used for debugging only.
+ print " $state_flag[$state], ";
+
+ # if this is the first row of the table for this state, put out the state name.
if ($stateNames[$state] ne "") {
- printf("%-20s", $stateNames[$state]."\"");
+ print " \"$stateNames[$state]\") ";
} else {
- printf("%-20s", "\"");
+ print " null ) ";
}
-
- # Put out a C++ comment showing the number (index) of this state row,
- print " // $state ";
+
+ # Put out a comment showing the number (index) of this state row,
+ print " // $state ";
print "\n";
- };
- print " };\n";
- print "}\n";
+ }
+ print " };\n";
+
+ print "}; \n";
+
}
else
{
print "#ifndef RBBIRPT_H\n";
print "#define RBBIRPT_H\n";
print "\n";
+ print "#include \"unicode/utypes.h\"\n";
+ print "\n";
print "U_NAMESPACE_BEGIN\n";
#
print "//\n";
print "// Character classes for RBBI rule scanning.\n";
print "//\n";
- $i = 128; # State Table values for Unicode char sets range from 128-250.
- # Sets "default", "escaped", etc. get special handling.
- # They have no corresponding UnicodeSet object in the state machine,
- # but are handled by special case code. So we emit no reference
- # to a UnicodeSet object to them here.
- foreach $setName (keys %charClasses) {
- if ($setName eq "default") {
- $charClasses{$setName} = 255;}
- elsif ($setName eq "escaped") {
- $charClasses{$setName} = 254;}
- elsif ($setName eq "escapedP") {
- $charClasses{$setName} = 253;}
- elsif ($setName eq "eof") {
- $charClasses{$setName} = 252;}
- else {
- # Normal character class. Fill in array with a ptr to the corresponding UnicodeSet in the state machine.
- print " static const uint8_t kRuleSet_$setName = $i;\n";
- $charClasses{$setName} = $i;
- $i++;
+ foreach $setName (sort keys %charClasses) {
+ if ($charClasses{$setName} < 250) {
+ # Normal character class.
+ print " static const uint8_t kRuleSet_$setName = $charClasses{$setName};\n";
}
}
print "\n\n";
# Emit the enum for the actions to be performed.
#
print "enum RBBI_RuleParseAction {\n";
- foreach $act (keys %actions) {
+ foreach $act (sort keys %actions) {
print " $act,\n";
}
print " rbbiLastAction};\n\n";