]>
git.saurik.com Git - apple/icu.git/blob - icuSources/common/rbbiscan57.h
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
5 // Copyright (C) 2002-2016, International Business Machines Corporation and others.
6 // All Rights Reserved.
9 **********************************************************************
10 * Legacy version of RBBIRuleScanner from ICU 57,
11 * only for use by Apple RuleBasedTokenizer
12 **********************************************************************
19 #include "unicode/utypes.h"
20 #include "unicode/uobject.h"
21 #include "unicode/uniset.h"
22 #include "unicode/parseerr.h"
25 #include "unicode/symtable.h"// For UnicodeSet parsing, is the interface that
26 // looks up references to $variables within a set.
33 class RBBIRuleBuilder57
;
34 class RBBISymbolTable57
;
37 //--------------------------------------------------------------------------------
39 // class RBBIRuleScanner57 does the lowest level, character-at-a-time
40 // scanning of break iterator rules.
42 // The output of the scanner is parse trees for
43 // the rule expressions and a list of all Unicode Sets
46 //--------------------------------------------------------------------------------
48 class RBBIRuleScanner57
: public UMemory
{
52 kStackSize
= 100 // The size of the state stack for
53 }; // rules parsing. Corresponds roughly
54 // to the depth of parentheses nesting
55 // that is allowed in the rules.
60 RBBIRuleChar() : fChar(0), fEscaped(FALSE
) {};
63 RBBIRuleScanner57(RBBIRuleBuilder57
*rb
);
66 virtual ~RBBIRuleScanner57();
68 void nextChar(RBBIRuleChar
&c
); // Get the next char from the input stream.
69 // Return false if at end.
71 UBool
push(const RBBIRuleChar
&c
); // Push (unget) one character.
72 // Only a single character may be pushed.
74 void parse(); // Parse the rules, generating two parse
75 // trees, one each for the forward and
77 // and a list of UnicodeSets encountered.
80 * Return a rules string without unnecessary
83 static UnicodeString
stripRules(const UnicodeString
&rules
);
86 UBool
doParseActions(int32_t a
);
87 void error(UErrorCode e
); // error reporting convenience function.
88 void fixOpStack(RBBINode::OpPrecedence p
);
90 void findSetFor(const UnicodeString
&s
, RBBINode
*node
, UnicodeSet
*setToAdopt
= NULL
);
94 void printNodeStack(const char *title
);
96 RBBINode
*pushNewNode(RBBINode::NodeType t
);
100 RBBIRuleBuilder57
*fRB
; // The rule builder that we are part of.
102 int32_t fScanIndex
; // Index of current character being processed
103 // in the rule input string.
104 int32_t fNextIndex
; // Index of the next character, which
105 // is the first character not yet scanned.
106 UBool fQuoteMode
; // Scan is in a 'quoted region'
107 int32_t fLineNum
; // Line number in input file.
108 int32_t fCharNum
; // Char position within the line.
109 UChar32 fLastChar
; // Previous char, needed to count CR-LF
110 // as a single line, not two.
112 RBBIRuleChar fC
; // Current char for parse state machine
114 UnicodeString fVarName
; // $variableName, valid when we've just
117 RBBIRuleTableEl
**fStateTable
; // State Transition Table for RBBI Rule
118 // parsing. index by p[state][char-class]
120 uint16_t fStack
[kStackSize
]; // State stack, holds state pushes
121 int32_t fStackPtr
; // and pops as specified in the state
124 RBBINode
*fNodeStack
[kStackSize
]; // Node stack, holds nodes created
125 // during the parse of a rule
126 int32_t fNodeStackPtr
;
129 UBool fReverseRule
; // True if the rule currently being scanned
130 // is a reverse direction rule (if it
131 // starts with a '!')
133 UBool fLookAheadRule
; // True if the rule includes a '/'
134 // somewhere within it.
136 UBool fNoChainInRule
; // True if the current rule starts with a '^'.
138 RBBISymbolTable57
*fSymbolTable
; // symbol table, holds definitions of
139 // $variable symbols.
141 UHashtable
*fSetTable
; // UnicodeSet hash table, holds indexes to
142 // the sets created while parsing rules.
143 // The key is the string used for creating
146 UnicodeSet fRuleSets
[10]; // Unicode Sets that are needed during
147 // the scanning of RBBI rules. The
148 // indices for these are assigned by the
149 // perl script that builds the state tables.
152 int32_t fRuleNum
; // Counts each rule as it is scanned.
154 int32_t fOptionStart
; // Input index of start of a !!option
155 // keyword, while being scanned.
157 UnicodeSet
*gRuleSet_rule_char
;
158 UnicodeSet
*gRuleSet_white_space
;
159 UnicodeSet
*gRuleSet_name_char
;
160 UnicodeSet
*gRuleSet_name_start_char
;
162 RBBIRuleScanner57(const RBBIRuleScanner57
&other
); // forbid copying of this class
163 RBBIRuleScanner57
&operator=(const RBBIRuleScanner57
&other
); // forbid copying of this class