-#**************************************************************************
-# Copyright (C) 2002-2004 International Business Machines Corporation *
-# and others. All rights reserved. *
-#**************************************************************************
-#
-# rbbicst Compile the RBBI rule paser state table data into initialized C data.
-# Usage:
-# cd icu/source/common
-# perl rbbicst.pl [-j] < rbbirpt.txt > rbbirpt.h
-#
-# The output file, rbbrpt.h, is included by some of the .cpp rbbi
-# implementation files. This perl script is NOT run as part
-# of a normal ICU build. It is run by hand when needed, and the
-# rbbirpt.h generated file is put back into cvs.
-#
-# See rbbirpt.h for a description of the input format for this script.
-#
-
-if ($ARGV[0] eq "-j") {
- $javaOutput = 1;
- shift @ARGV;
-}
-
-
-$num_states = 1; # Always the state number for the line being compiled.
-$line_num = 0; # The line number in the input file.
-
-$states{"pop"} = 255; # Add the "pop" to the list of defined state names.
- # This prevents any state from being labelled with "pop",
- # and resolves references to "pop" in the next state field.
-
-line_loop: while (<>) {
- chomp();
- $line = $_;
- @fields = split();
- $line_num++;
-
- # Remove # comments, which are any fields beginning with a #, plus all
- # that follow on the line.
- for ($i=0; $i<@fields; $i++) {
- if ($fields[$i] =~ /^#/) {
- @fields = @fields[0 .. $i-1];
- last;
- }
- }
- # ignore blank lines, and those with no fields left after stripping comments..
- if (@fields == 0) {
- next;
- }
-
- #
- # State Label: handling.
- # Does the first token end with a ":"? If so, it's the name of a state.
- # Put in a hash, together with the current state number,
- # so that we can later look up the number from the name.
- #
- if (@fields[0] =~ /.*:$/) {
- $state_name = @fields[0];
- $state_name =~ s/://; # strip off the colon from the state name.
-
- if ($states{$state_name} != 0) {
- print " rbbicst: at line $line-num duplicate definition of state $state_name\n";
- }
- $states{$state_name} = $num_states;
- $stateNames[$num_states] = $state_name;
-
- # if the label was the only thing on this line, go on to the next line,
- # otherwise assume that a state definition is on the same line and fall through.
- if (@fields == 1) {
- next line_loop;
- }
- shift @fields; # shift off label field in preparation
- # for handling the rest of the line.
- }
-
- #
- # State Transition line.
- # syntax is this,
- # character [n] target-state [^push-state] [function-name]
- # where
- # [something] is an optional something
- # character is either a single quoted character e.g. '['
- # or a name of a character class, e.g. white_space
- #
-
- $state_line_num[$num_states] = $line_num; # remember line number with each state
- # so we can make better error messages later.
- #
- # First field, character class or literal character for this transition.
- #
- if ($fields[0] =~ /^'.'$/) {
- # We've got a quoted literal character.
- $state_literal_chars[$num_states] = $fields[0];
- $state_literal_chars[$num_states] =~ s/'//g;
- } else {
- # We've got the name of a character class.
- $state_char_class[$num_states] = $fields[0];
- if ($fields[0] =~ /[\W]/) {
- print " rbbicsts: at line $line_num, bad character literal or character class name.\n";
- print " scanning $fields[0]\n";
- exit(-1);
- }
- }
- shift @fields;
-
- #
- # do the 'n' flag
- #
- $state_flag[$num_states] = "FALSE";
- if ($fields[0] eq "n") {
- $state_flag[$num_states] = "TRUE";
- shift @fields;
- }
-
- #
- # do the destination state.
- #
- $state_dest_state[$num_states] = $fields[0];
- if ($fields[0] eq "") {
- print " rbbicsts: at line $line_num, destination state missing.\n";
- exit(-1);
- }
- shift @fields;
-
- #
- # do the push state, if present.
- #
- if ($fields[0] =~ /^\^/) {
- $fields[0] =~ s/^\^//;
- $state_push_state[$num_states] = $fields[0];
- if ($fields[0] eq "" ) {
- print " rbbicsts: at line $line_num, expected state after ^ (no spaces).\n";
- exit(-1);
- }
- shift @fields;
- }
-
- #
- # Lastly, do the optional action name.
- #
- if ($fields[0] ne "") {
- $state_func_name[$num_states] = $fields[0];
- shift @fields;
- }
-
- #
- # There should be no fields left on the line at this point.
- #
- if (@fields > 0) {
- print " rbbicsts: at line $line_num, unexpected extra stuff on input line.\n";
- print " scanning $fields[0]\n";
- }
- $num_states++;
-}
-
-#
-# We've read in the whole file, now go back and output the
-# C source code for the state transition table.
-#
-# We read all states first, before writing anything, so that the state numbers
-# for the destination states are all available to be written.
-#
-
-#
-# Make hashes for the names of the character classes and
-# for the names of the actions that appeared.
-#
-for ($state=1; $state < $num_states; $state++) {
- if ($state_char_class[$state] ne "") {
- if ($charClasses{$state_char_class[$state]} == 0) {
- $charClasses{$state_char_class[$state]} = 1;
- }
- }
- if ($state_func_name[$state] eq "") {
- $state_func_name[$state] = "doNOP";
- }
- if ($actions{$state_action_name[$state]} == 0) {
- $actions{$state_func_name[$state]} = 1;
- }
-}
-
-#
-# Check that all of the destination states have been defined
-#
-#
-$states{"exit"} = 0; # Predefined state name, terminates state machine.
-for ($state=1; $state<$num_states; $state++) {
- if ($states{$state_dest_state[$state]} == 0 && $state_dest_state[$state] ne "exit") {
- print "Error at line $state_line_num[$state]: target state \"$state_dest_state[$state]\" is not defined.\n";
- $errors++;
- }
- if ($state_push_state[$state] ne "" && $states{$state_push_state[$state]} == 0) {
- print "Error at line $state_line_num[$state]: target state \"$state_push_state[$state]\" is not defined.\n";
- $errors++;
- }
-}
-
-die if ($errors>0);
-
-my ($sec, $min, $hour, , $day, $mon, $year, $wday, $yday, $isdst) = localtime;
-$year += 1900;
-
-if ($javaOutput) {
- print "/*\n";
- print " *******************************************************************************\n";
- print " * Copyright (C) 2003-$year,\n";
- print " * International Business Machines Corporation and others. All Rights Reserved.\n";
- print " *******************************************************************************\n";
- print " */\n";
- print " \n";
- print "package com.ibm.icu.text;\n";
- print " \n";
- print "/**\n";
- print " * Generated Java File. Do not edit by hand.\n";
- print " * This file contains the state table for the ICU Rule Based Break Iterator\n";
- print " * rule parser.\n";
- print " * It is generated by the Perl script \"rbbicst.pl\" from\n";
- print " * the rule parser state definitions file \"rbbirpt.txt\".\n";
- print " *\n";
- print " */\n";
-
- print "public class RuleBasedBreakIteratorStateTable\n";
- print "{\n";
-
- #
- # Emit the constants for the actions to be performed.
- #
- $n = 1;
- foreach $act (keys %actions) {
- print " public static final int $act = $n;\n";
- $n++;
- }
- print " \n";
- #
- # emit the state transition table
- #
- print " public static final String[] gRuleParseStateTable = {\n";
- printf(" \"\\u%04.4x\\u%04.4x\\u%04.4x\\u%04.4x\\u%04.4x\"\n", doNOP, 0, 0, 0, 1);
- for ($state=1; $state < $num_states; $state++) {
- printf(" , \"\\u%04.4x", $state_func_name[$state]);
- # print " , {$state_func_name[$state],";
- if ($state_literal_chars[$state] ne "") {
- printf("\\u%04.4x", $state_func_name[$state]);
- }else {
- printf("\\u%04.4x", $charClasses{$state_char_class[$state]});
- }
- printf("\\u%04.4x", $states{$state_dest_state[$state]});
-
- # The push-state field is optional. If omitted, fill field with a zero, which flags
- # the state machine that there is no push state.
- if ($state_push_state[$state] eq "") {
- print "\\u0000";
- } else {
- printf("\\u%04.4x", $states{$state_push_state[$state]});
- }
- printf("\\u%04.4x", $state_flag[$state]);
-
- # For the first row of each state, append the state name.
- # Used for debugging only.
- if ($stateNames[$state] ne "") {
- printf("%-20s", $stateNames[$state]."\"");
- } else {
- printf("%-20s", "\"");
- }
-
- # Put out a C++ comment showing the number (index) of this state row,
- print " // $state ";
- print "\n";
- };
- print " };\n";
- print "}\n";
-}
-else
-{
- #
- # C++ Output ...
- #
-
-
- print "//---------------------------------------------------------------------------------\n";
- print "//\n";
- print "// Generated Header File. Do not edit by hand.\n";
- print "// This file contains the state table for the ICU Rule Based Break Iterator\n";
- print "// rule parser.\n";
- print "// It is generated by the Perl script \"rbbicst.pl\" from\n";
- print "// the rule parser state definitions file \"rbbirpt.txt\".\n";
- print "//\n";
- print "// Copyright (C) 2002-$year International Business Machines Corporation \n";
- print "// and others. All rights reserved. \n";
- print "//\n";
- print "//---------------------------------------------------------------------------------\n";
- print "#ifndef RBBIRPT_H\n";
- print "#define RBBIRPT_H\n";
- print "\n";
- print "U_NAMESPACE_BEGIN\n";
-
- #
- # Emit the constants for indicies of Unicode Sets
- # Define one constant for each of the character classes encountered.
- # At the same time, store the index corresponding to the set name back into hash.
- #
- print "//\n";
- print "// Character classes for RBBI rule scanning.\n";
- print "//\n";
- $i = 128; # State Table values for Unicode char sets range from 128-250.
- # Sets "default", "escaped", etc. get special handling.
- # They have no corresponding UnicodeSet object in the state machine,
- # but are handled by special case code. So we emit no reference
- # to a UnicodeSet object to them here.
- foreach $setName (keys %charClasses) {
- if ($setName eq "default") {
- $charClasses{$setName} = 255;}
- elsif ($setName eq "escaped") {
- $charClasses{$setName} = 254;}
- elsif ($setName eq "escapedP") {
- $charClasses{$setName} = 253;}
- elsif ($setName eq "eof") {
- $charClasses{$setName} = 252;}
- else {
- # Normal character class. Fill in array with a ptr to the corresponding UnicodeSet in the state machine.
- print " static const uint8_t kRuleSet_$setName = $i;\n";
- $charClasses{$setName} = $i;
- $i++;
- }
- }
- print "\n\n";
-
- #
- # Emit the enum for the actions to be performed.
- #
- print "enum RBBI_RuleParseAction {\n";
- foreach $act (keys %actions) {
- print " $act,\n";
- }
- print " rbbiLastAction};\n\n";
-
- #
- # Emit the struct definition for transtion table elements.
- #
- print "//-------------------------------------------------------------------------------\n";
- print "//\n";
- print "// RBBIRuleTableEl represents the structure of a row in the transition table\n";
- print "// for the rule parser state machine.\n";
- print "//-------------------------------------------------------------------------------\n";
- print "struct RBBIRuleTableEl {\n";
- print " RBBI_RuleParseAction fAction;\n";
- print " uint8_t fCharClass; // 0-127: an individual ASCII character\n";
- print " // 128-255: character class index\n";
- print " uint8_t fNextState; // 0-250: normal next-stat numbers\n";
- print " // 255: pop next-state from stack.\n";
- print " uint8_t fPushState;\n";
- print " UBool fNextChar;\n";
- print "};\n\n";
-
- #
- # emit the state transition table
- #
- print "static const struct RBBIRuleTableEl gRuleParseStateTable[] = {\n";
- print " {doNOP, 0, 0, 0, TRUE}\n"; # State 0 is a dummy. Real states start with index = 1.
- for ($state=1; $state < $num_states; $state++) {
- print " , {$state_func_name[$state],";
- if ($state_literal_chars[$state] ne "") {
- $c = $state_literal_chars[$state];
- printf(" %d /* $c */,", ord($c)); # use numeric value, so EBCDIC machines are ok.
- }else {
- print " $charClasses{$state_char_class[$state]},";
- }
- print " $states{$state_dest_state[$state]},";
-
- # The push-state field is optional. If omitted, fill field with a zero, which flags
- # the state machine that there is no push state.
- if ($state_push_state[$state] eq "") {
- print "0, ";
- } else {
- print " $states{$state_push_state[$state]},";
- }
- print " $state_flag[$state]} ";
-
- # Put out a C++ comment showing the number (index) of this state row,
- # and, if this is the first row of the table for this state, the state name.
- print " // $state ";
- if ($stateNames[$state] ne "") {
- print " $stateNames[$state]";
- }
- print "\n";
- };
- print " };\n";
-
-
- #
- # emit a mapping array from state numbers to state names.
- #
- # This array is used for producing debugging output from the rule parser.
- #
- print "#ifdef RBBI_DEBUG\n";
- print "static const char * const RBBIRuleStateNames[] = {";
- for ($state=0; $state<$num_states; $state++) {
- if ($stateNames[$state] ne "") {
- print " \"$stateNames[$state]\",\n";
- } else {
- print " 0,\n";
- }
- }
- print " 0};\n";
- print "#endif\n\n";
-
- print "U_NAMESPACE_END\n";
- print "#endif\n";
-}
-
-
-
+#**************************************************************************\r
+# Copyright (C) 2002-2005 International Business Machines Corporation *\r
+# and others. All rights reserved. *\r
+#**************************************************************************\r
+#\r
+# rbbicst Compile the RBBI rule parser state table data into initialized C data.\r
+# Usage:\r
+# cd icu/source/common\r
+# perl rbbicst.pl < rbbirpt.txt > rbbirpt.h\r
+# perl rbbicst.pl -j < rbbirpt.txt > RBBIRuleParseTable.java\r
+#\r
+# The output file, rbbirpt.h, is included by some of the .cpp rbbi\r
+# implementation files. This perl script is NOT run as part\r
+# of a normal ICU build. It is run by hand when needed, and the\r
+# rbbirpt.h generated file is put back into cvs.\r
+#\r
+# See rbbirpt.txt for a description of the input format for this script.\r
+#\r
+\r
+if ($ARGV[0] eq "-j") {\r
+ $javaOutput = 1;\r
+ shift @ARGV;\r
+}\r
+\r
+\r
+$num_states = 1; # Always the state number for the line being compiled.\r
+$line_num = 0; # The line number in the input file.\r
+\r
+$states{"pop"} = 255; # Add the "pop" to the list of defined state names.\r
+ # This prevents any state from being labelled with "pop",\r
+ # and resolves references to "pop" in the next state field.\r
+\r
+line_loop: while (<>) {\r
+ chomp();\r
+ $line = $_;\r
+ @fields = split();\r
+ $line_num++;\r
+\r
+ # Remove # comments, which are any fields beginning with a #, plus all\r
+ # that follow on the line.\r
+ for ($i=0; $i<@fields; $i++) {\r
+ if ($fields[$i] =~ /^#/) {\r
+ @fields = @fields[0 .. $i-1];\r
+ last;\r
+ }\r
+ }\r
+ # ignore blank lines, and those with no fields left after stripping comments..\r
+ if (@fields == 0) {\r
+ next;\r
+ }\r
+\r
+ #\r
+ # State Label: handling.\r
+ # Does the first token end with a ":"? If so, it's the name of a state.\r
+ # Put in a hash, together with the current state number,\r
+ # so that we can later look up the number from the name.\r
+ #\r
+    if ($fields[0] =~ /.*:$/) {\r
+        $state_name = $fields[0];\r
+        $state_name =~ s/:$//;        # strip only the trailing colon that marked this token as a label.\r
+\r
+        if ($states{$state_name} != 0) {    # real states are numbered from 1 and "pop" is pre-seeded, so non-zero means already defined.\r
+            print " rbbicst: at line $line_num duplicate definition of state $state_name\n";\r
+        }\r
+        $states{$state_name} = $num_states;        # name -> row number, used later to resolve target and push states.\r
+        $stateNames[$num_states] = $state_name;    # row number -> name, only set for the first row of each state.\r
+\r
+        # If the label was the only thing on this line, go on to the next line;\r
+        # otherwise assume that a state transition is on the same line and fall through.\r
+        if (@fields == 1) {\r
+            next line_loop;\r
+        }\r
+        shift @fields;                      # shift off label field in preparation\r
+                                            # for handling the rest of the line.\r
+    }\r
+\r
+ #\r
+ # State Transition line.\r
+ # syntax is this,\r
+ # character [n] target-state [^push-state] [function-name]\r
+ # where\r
+ # [something] is an optional something\r
+ # character is either a single quoted character e.g. '['\r
+ # or a name of a character class, e.g. white_space\r
+ #\r
+\r
+ $state_line_num[$num_states] = $line_num; # remember line number with each state\r
+ # so we can make better error messages later.\r
+ #\r
+ # First field, character class or literal character for this transition.\r
+ #\r
+ if ($fields[0] =~ /^'.'$/) {\r
+ # We've got a quoted literal character.\r
+ $state_literal_chars[$num_states] = $fields[0];\r
+ $state_literal_chars[$num_states] =~ s/'//g;\r
+ } else {\r
+ # We've got the name of a character class.\r
+ $state_char_class[$num_states] = $fields[0];\r
+ if ($fields[0] =~ /[\W]/) {\r
+ print " rbbicsts: at line $line_num, bad character literal or character class name.\n";\r
+ print " scanning $fields[0]\n";\r
+ exit(-1);\r
+ }\r
+ }\r
+ shift @fields;\r
+\r
+ #\r
+ # do the 'n' flag\r
+ #\r
+ $state_flag[$num_states] = $javaOutput? "false" : "FALSE";\r
+ if ($fields[0] eq "n") {\r
+ $state_flag[$num_states] = $javaOutput? "true": "TRUE";\r
+ shift @fields;\r
+ }\r
+\r
+ #\r
+ # do the destination state.\r
+ #\r
+ $state_dest_state[$num_states] = $fields[0];\r
+ if ($fields[0] eq "") {\r
+ print " rbbicsts: at line $line_num, destination state missing.\n";\r
+ exit(-1);\r
+ }\r
+ shift @fields;\r
+\r
+ #\r
+ # do the push state, if present.\r
+ #\r
+ if ($fields[0] =~ /^\^/) {\r
+ $fields[0] =~ s/^\^//;\r
+ $state_push_state[$num_states] = $fields[0];\r
+ if ($fields[0] eq "" ) {\r
+ print " rbbicsts: at line $line_num, expected state after ^ (no spaces).\n";\r
+ exit(-1);\r
+ }\r
+ shift @fields;\r
+ }\r
+\r
+ #\r
+ # Lastly, do the optional action name.\r
+ #\r
+ if ($fields[0] ne "") {\r
+ $state_func_name[$num_states] = $fields[0];\r
+ shift @fields;\r
+ }\r
+\r
+ #\r
+ # There should be no fields left on the line at this point.\r
+ #\r
+ if (@fields > 0) {\r
+ print " rbbicsts: at line $line_num, unexpected extra stuff on input line.\n";\r
+ print " scanning $fields[0]\n";\r
+ }\r
+ $num_states++;\r
+}\r
+\r
+#\r
+# We've read in the whole file, now go back and output the\r
+# C source code for the state transition table.\r
+#\r
+# We read all states first, before writing anything, so that the state numbers\r
+# for the destination states are all available to be written.\r
+#\r
+\r
+#\r
+# Make hashes for the names of the character classes and\r
+# for the names of the actions that appeared.\r
+#\r
+for ($state=1; $state < $num_states; $state++) {\r
+    if ($state_char_class[$state] ne "") {                      # rows matching a literal character have no class name.\r
+        if ($charClasses{$state_char_class[$state]} == 0) {\r
+            $charClasses{$state_char_class[$state]} = 1;        # placeholder; real numbers are assigned later.\r
+        }\r
+    }\r
+    if ($state_func_name[$state] eq "") {\r
+        $state_func_name[$state] = "doNOP";                     # rows without an action default to doNOP.\r
+    }\r
+    if ($actions{$state_func_name[$state]} == 0) {              # test the same key that is stored below.\r
+        $actions{$state_func_name[$state]} = 1;\r
+    }\r
+}\r
+\r
+#\r
+# Check that all of the destination states have been defined\r
+#\r
+#\r
+$states{"exit"} = 0; # Predefined state name, terminates state machine.\r
+for ($state=1; $state<$num_states; $state++) {\r
+ if ($states{$state_dest_state[$state]} == 0 && $state_dest_state[$state] ne "exit") {\r
+ print "Error at line $state_line_num[$state]: target state \"$state_dest_state[$state]\" is not defined.\n";\r
+ $errors++;\r
+ }\r
+ if ($state_push_state[$state] ne "" && $states{$state_push_state[$state]} == 0) {\r
+ print "Error at line $state_line_num[$state]: target state \"$state_push_state[$state]\" is not defined.\n";\r
+ $errors++;\r
+ }\r
+}\r
+\r
+die if ($errors>0);\r
+\r
+#\r
+# Assign numbers to each of the character classes used.\r
+# Sets are numbered from 128 - 250\r
+# The values 0-127 in the state table are used for matching\r
+# individual ASCII characters (the only thing that can appear in the rules.)\r
+# The "set" names appearing in the code below (default, etc.) need special\r
+# handling because they do not correspond to a normal set of characters,\r
+# but trigger special handling by code in the state machine.\r
+#\r
+$i = 128;\r
+foreach $setName (sort keys %charClasses) {\r
+ if ($setName eq "default") {\r
+ $charClasses{$setName} = 255;}\r
+ elsif ($setName eq "escaped") {\r
+ $charClasses{$setName} = 254;}\r
+ elsif ($setName eq "escapedP") {\r
+ $charClasses{$setName} = 253;}\r
+ elsif ($setName eq "eof") {\r
+ $charClasses{$setName} = 252;}\r
+ else {\r
+ # Normal (single) character class. Number them.\r
+ $charClasses{$setName} = $i;\r
+ $i++;\r
+ }\r
+}\r
+\r
+\r
+my ($sec, $min, $hour, , $day, $mon, $year, $wday, $yday, $isdst) = localtime;\r
+$year += 1900;\r
+\r
+if ($javaOutput) {\r
+ print "/*\n";\r
+ print " *******************************************************************************\n";\r
+ print " * Copyright (C) 2003-$year,\n";\r
+ print " * International Business Machines Corporation and others. All Rights Reserved.\n";\r
+ print " *******************************************************************************\n";\r
+ print " */\n";\r
+ print " \n";\r
+ print "package com.ibm.icu.text;\n";\r
+ print " \n";\r
+ print "/**\n";\r
+ print " * Generated Java File. Do not edit by hand.\n";\r
+ print " * This file contains the state table for the ICU Rule Based Break Iterator\n";\r
+ print " * rule parser.\n";\r
+ print " * It is generated by the Perl script \"rbbicst.pl\" from\n";\r
+ print " * the rule parser state definitions file \"rbbirpt.txt\".\n";\r
+ print " * \@internal \n";\r
+ print " *\n";\r
+ print " */\n";\r
+\r
+ print "class RBBIRuleParseTable\n";\r
+ print "{\n";\r
+\r
+ #\r
+ # Emit the constants for the actions to be performed.\r
+ #\r
+ $n = 1;\r
+ foreach $act (sort keys %actions) {\r
+ print " static final short $act = $n;\n";\r
+ $n++;\r
+ }\r
+ print " \n";\r
+ \r
+ #\r
+ # Emit constants for char class names\r
+ #\r
+ foreach $setName (sort keys %charClasses) {\r
+ print " static final short kRuleSet_$setName = $charClasses{$setName};\n";\r
+ }\r
+ print "\n\n";\r
+ \r
+ \r
+ print " static class RBBIRuleTableElement { \n";\r
+ print " short fAction; \n";\r
+ print " short fCharClass; \n";\r
+ print " short fNextState; \n";\r
+ print " short fPushState; \n";\r
+ print " boolean fNextChar; \n";\r
+ print " String fStateName; \n";\r
+ print " RBBIRuleTableElement(short a, int cc, int ns, int ps, boolean nc, String sn) { \n";\r
+ print " fAction = a; \n";\r
+ print " fCharClass = (short)cc; \n";\r
+ print " fNextState = (short)ns; \n";\r
+ print " fPushState = (short)ps; \n";\r
+ print " fNextChar = nc; \n";\r
+ print " fStateName = sn; \n";\r
+ print " } \n";\r
+ print " }; \n";\r
+ print " \n";\r
+ \r
+ \r
+ print " static RBBIRuleTableElement[] gRuleParseStateTable = { \n ";\r
+ print " new RBBIRuleTableElement(doNOP, 0, 0,0, true, null ) // 0 \n"; #output the unused state 0. \r
+ for ($state=1; $state < $num_states; $state++) {\r
+ print " , new RBBIRuleTableElement($state_func_name[$state],";\r
+ if ($state_literal_chars[$state] ne "") {\r
+ $c = $state_literal_chars[$state];\r
+ print("'$c', "); \r
+ }else {\r
+ print " $charClasses{$state_char_class[$state]},";\r
+ }\r
+ print " $states{$state_dest_state[$state]},";\r
+ \r
+ # The push-state field is optional. If omitted, fill field with a zero, which flags\r
+ # the state machine that there is no push state.\r
+ if ($state_push_state[$state] eq "") {\r
+ print "0, ";\r
+ } else {\r
+ print " $states{$state_push_state[$state]},";\r
+ }\r
+ print " $state_flag[$state], ";\r
+ \r
+ # if this is the first row of the table for this state, put out the state name.\r
+ if ($stateNames[$state] ne "") {\r
+ print " \"$stateNames[$state]\") ";\r
+ } else {\r
+ print " null ) ";\r
+ }\r
+ \r
+ # Put out a comment showing the number (index) of this state row,\r
+ print " // $state ";\r
+ print "\n";\r
+ }\r
+ print " };\n";\r
+\r
+ print "}; \n";\r
+ \r
+}\r
+else\r
+{\r
+ #\r
+ # C++ Output ...\r
+ #\r
+\r
+\r
+ print "//---------------------------------------------------------------------------------\n";\r
+ print "//\n";\r
+ print "// Generated Header File. Do not edit by hand.\n";\r
+ print "// This file contains the state table for the ICU Rule Based Break Iterator\n";\r
+ print "// rule parser.\n";\r
+ print "// It is generated by the Perl script \"rbbicst.pl\" from\n";\r
+ print "// the rule parser state definitions file \"rbbirpt.txt\".\n";\r
+ print "//\n";\r
+ print "// Copyright (C) 2002-$year International Business Machines Corporation \n";\r
+ print "// and others. All rights reserved. \n";\r
+ print "//\n";\r
+ print "//---------------------------------------------------------------------------------\n";\r
+ print "#ifndef RBBIRPT_H\n";\r
+ print "#define RBBIRPT_H\n";\r
+ print "\n";\r
+ print "U_NAMESPACE_BEGIN\n";\r
+\r
+ #\r
+ # Emit the constants for indices of Unicode Sets\r
+ # Define one constant for each of the character classes encountered.\r
+ # At the same time, store the index corresponding to the set name back into hash.\r
+ #\r
+ print "//\n";\r
+ print "// Character classes for RBBI rule scanning.\n";\r
+ print "//\n";\r
+ foreach $setName (sort keys %charClasses) {\r
+ if ($charClasses{$setName} < 250) {\r
+ # Normal character class.\r
+ print " static const uint8_t kRuleSet_$setName = $charClasses{$setName};\n";\r
+ }\r
+ }\r
+ print "\n\n";\r
+\r
+ #\r
+ # Emit the enum for the actions to be performed.\r
+ #\r
+ print "enum RBBI_RuleParseAction {\n";\r
+ foreach $act (sort keys %actions) {\r
+ print " $act,\n";\r
+ }\r
+ print " rbbiLastAction};\n\n";\r
+\r
+ #\r
+ # Emit the struct definition for transition table elements.\r
+ #\r
+ print "//-------------------------------------------------------------------------------\n";\r
+ print "//\n";\r
+ print "// RBBIRuleTableEl represents the structure of a row in the transition table\n";\r
+ print "// for the rule parser state machine.\n";\r
+ print "//-------------------------------------------------------------------------------\n";\r
+ print "struct RBBIRuleTableEl {\n";\r
+ print " RBBI_RuleParseAction fAction;\n";\r
+ print " uint8_t fCharClass; // 0-127: an individual ASCII character\n";\r
+ print " // 128-255: character class index\n";\r
+ print " uint8_t fNextState; // 0-250: normal next-stat numbers\n";\r
+ print " // 255: pop next-state from stack.\n";\r
+ print " uint8_t fPushState;\n";\r
+ print " UBool fNextChar;\n";\r
+ print "};\n\n";\r
+\r
+ #\r
+ # emit the state transition table\r
+ #\r
+ print "static const struct RBBIRuleTableEl gRuleParseStateTable[] = {\n";\r
+ print " {doNOP, 0, 0, 0, TRUE}\n"; # State 0 is a dummy. Real states start with index = 1.\r
+ for ($state=1; $state < $num_states; $state++) {\r
+ print " , {$state_func_name[$state],";\r
+ if ($state_literal_chars[$state] ne "") {\r
+ $c = $state_literal_chars[$state];\r
+ printf(" %d /* $c */,", ord($c)); # use numeric value, so EBCDIC machines are ok.\r
+ }else {\r
+ print " $charClasses{$state_char_class[$state]},";\r
+ }\r
+ print " $states{$state_dest_state[$state]},";\r
+\r
+ # The push-state field is optional. If omitted, fill field with a zero, which flags\r
+ # the state machine that there is no push state.\r
+ if ($state_push_state[$state] eq "") {\r
+ print "0, ";\r
+ } else {\r
+ print " $states{$state_push_state[$state]},";\r
+ }\r
+ print " $state_flag[$state]} ";\r
+\r
+ # Put out a C++ comment showing the number (index) of this state row,\r
+ # and, if this is the first row of the table for this state, the state name.\r
+ print " // $state ";\r
+ if ($stateNames[$state] ne "") {\r
+ print " $stateNames[$state]";\r
+ }\r
+ print "\n";\r
+ };\r
+ print " };\n";\r
+\r
+\r
+ #\r
+ # emit a mapping array from state numbers to state names.\r
+ #\r
+ # This array is used for producing debugging output from the rule parser.\r
+ #\r
+ print "#ifdef RBBI_DEBUG\n";\r
+ print "static const char * const RBBIRuleStateNames[] = {";\r
+ for ($state=0; $state<$num_states; $state++) {\r
+ if ($stateNames[$state] ne "") {\r
+ print " \"$stateNames[$state]\",\n";\r
+ } else {\r
+ print " 0,\n";\r
+ }\r
+ }\r
+ print " 0};\n";\r
+ print "#endif\n\n";\r
+\r
+ print "U_NAMESPACE_END\n";\r
+ print "#endif\n";\r
+}\r
+\r
+\r
+\r