]>
Commit | Line | Data |
---|---|---|
0f5d89e8 A |
1 | // © 2016 and later: Unicode, Inc. and others. |
2 | // License & terms of use: http://www.unicode.org/copyright.html | |
3 | // | |
4 | // | |
5 | // Copyright (C) 2002-2016, International Business Machines Corporation and others. | |
6 | // All Rights Reserved. | |
7 | // | |
8 | /* | |
9 | ********************************************************************** | |
10 | * Legacy version of RBBIRuleScanner from ICU 57, | |
11 | * only for use by Apple RuleBasedTokenizer | |
12 | ********************************************************************** | |
13 | */ | |
14 | ||
15 | ||
16 | #ifndef RBBISCAN57_H | |
17 | #define RBBISCAN57_H | |
18 | ||
19 | #include "unicode/utypes.h" | |
20 | #include "unicode/uobject.h" | |
21 | #include "unicode/uniset.h" | |
22 | #include "unicode/parseerr.h" | |
23 | #include "uhash.h" | |
24 | #include "uvector.h" | |
25 | #include "unicode/symtable.h"// For UnicodeSet parsing: this is the interface that | |
26 | // looks up references to $variables within a set. | |
27 | #include "rbbi57.h" | |
28 | #include "rbbinode.h" | |
29 | #include "rbbirpt.h" | |
30 | ||
31 | U_NAMESPACE_BEGIN | |
32 | ||
33 | class RBBIRuleBuilder57; | |
34 | class RBBISymbolTable57; | |
35 | ||
36 | ||
37 | //-------------------------------------------------------------------------------- | |
38 | // | |
39 | // class RBBIRuleScanner57 does the lowest level, character-at-a-time | |
40 | // scanning of break iterator rules. | |
41 | // | |
42 | // The output of the scanner is parse trees for | |
43 | // the rule expressions and a list of all Unicode Sets | |
44 | // encountered. | |
45 | // | |
46 | //-------------------------------------------------------------------------------- | |
47 | ||
48 | class RBBIRuleScanner57 : public UMemory { | |
49 | public: | |
50 | ||
51 | enum { | |
52 | kStackSize = 100 // The size of the state stack for | |
53 | }; // rules parsing. Corresponds roughly | |
54 | // to the depth of parentheses nesting | |
55 | // that is allowed in the rules. | |
56 | ||
57 | struct RBBIRuleChar { | |
58 | UChar32 fChar; | |
59 | UBool fEscaped; | |
60 | RBBIRuleChar() : fChar(0), fEscaped(FALSE) {}; | |
61 | }; | |
62 | ||
63 | RBBIRuleScanner57(RBBIRuleBuilder57 *rb); | |
64 | ||
65 | ||
66 | virtual ~RBBIRuleScanner57(); | |
67 | ||
68 | void nextChar(RBBIRuleChar &c); // Get the next char from the input stream. | |
69 | // The result is delivered through c; the function itself returns void. | |
70 | ||
71 | UBool push(const RBBIRuleChar &c); // Push (unget) one character. | |
72 | // Only a single character may be pushed. | |
73 | ||
74 | void parse(); // Parse the rules, generating two parse | |
75 | // trees, one each for the forward and | |
76 | // reverse rules, | |
77 | // and a list of UnicodeSets encountered. | |
78 | ||
79 | /** | |
80 | * Return a rules string without unnecessary | |
81 | * characters. | |
82 | */ | |
83 | static UnicodeString stripRules(const UnicodeString &rules); | |
84 | private: | |
85 | ||
86 | UBool doParseActions(int32_t a); | |
87 | void error(UErrorCode e); // error reporting convenience function. | |
88 | void fixOpStack(RBBINode::OpPrecedence p); | |
89 | // findSetFor: setToAdopt is optional and defaults to NULL. | |
90 | void findSetFor(const UnicodeString &s, RBBINode *node, UnicodeSet *setToAdopt = NULL); | |
91 | ||
92 | UChar32 nextCharLL(); | |
93 | #ifdef RBBI_DEBUG | |
94 | void printNodeStack(const char *title); | |
95 | #endif | |
96 | RBBINode *pushNewNode(RBBINode::NodeType t); | |
97 | void scanSet(); | |
98 | ||
99 | ||
100 | RBBIRuleBuilder57 *fRB; // The rule builder that we are part of. | |
101 | ||
102 | int32_t fScanIndex; // Index of current character being processed | |
103 | // in the rule input string. | |
104 | int32_t fNextIndex; // Index of the next character, which | |
105 | // is the first character not yet scanned. | |
106 | UBool fQuoteMode; // Scan is in a 'quoted region' | |
107 | int32_t fLineNum; // Line number in input file. | |
108 | int32_t fCharNum; // Char position within the line. | |
109 | UChar32 fLastChar; // Previous char, needed to count CR-LF | |
110 | // as a single line, not two. | |
111 | ||
112 | RBBIRuleChar fC; // Current char for parse state machine | |
113 | // processing. | |
114 | UnicodeString fVarName; // $variableName, valid when we've just | |
115 | // scanned one. | |
116 | ||
117 | RBBIRuleTableEl **fStateTable; // State Transition Table for RBBI Rule | |
118 | // parsing. index by p[state][char-class] | |
119 | ||
120 | uint16_t fStack[kStackSize]; // State stack, holds state pushes | |
121 | int32_t fStackPtr; // and pops as specified in the state | |
122 | // transition rules. | |
123 | ||
124 | RBBINode *fNodeStack[kStackSize]; // Node stack, holds nodes created | |
125 | // during the parse of a rule | |
126 | int32_t fNodeStackPtr; | |
127 | ||
128 | ||
129 | UBool fReverseRule; // True if the rule currently being scanned | |
130 | // is a reverse direction rule (if it | |
131 | // starts with a '!') | |
132 | ||
133 | UBool fLookAheadRule; // True if the rule includes a '/' | |
134 | // somewhere within it. | |
135 | ||
136 | UBool fNoChainInRule; // True if the current rule starts with a '^'. | |
137 | ||
138 | RBBISymbolTable57 *fSymbolTable; // symbol table, holds definitions of | |
139 | // $variable symbols. | |
140 | ||
141 | UHashtable *fSetTable; // UnicodeSet hash table, holds indexes to | |
142 | // the sets created while parsing rules. | |
143 | // The key is the string used for creating | |
144 | // the set. | |
145 | ||
146 | UnicodeSet fRuleSets[10]; // Unicode Sets that are needed during | |
147 | // the scanning of RBBI rules. The | |
148 | // indices for these are assigned by the | |
149 | // perl script that builds the state tables. | |
150 | // See rbbirpt.h. | |
151 | ||
152 | int32_t fRuleNum; // Counts each rule as it is scanned. | |
153 | ||
154 | int32_t fOptionStart; // Input index of start of a !!option | |
155 | // keyword, while being scanned. | |
156 | ||
157 | UnicodeSet *gRuleSet_rule_char; | |
158 | UnicodeSet *gRuleSet_white_space; | |
159 | UnicodeSet *gRuleSet_name_char; | |
160 | UnicodeSet *gRuleSet_name_start_char; | |
161 | ||
162 | RBBIRuleScanner57(const RBBIRuleScanner57 &other); // forbid copying of this class | |
163 | RBBIRuleScanner57 &operator=(const RBBIRuleScanner57 &other); // forbid copying of this class | |
164 | }; | |
165 | ||
166 | U_NAMESPACE_END | |
167 | ||
168 | #endif |