]>
Commit | Line | Data |
---|---|---|
f3c0d7a5 A |
1 | // © 2016 and later: Unicode, Inc. and others. |
2 | // License & terms of use: http://www.unicode.org/copyright.html | |
b75a7d8f A |
3 | // |
4 | // rbbiscan.h | |
5 | // | |
2ca993e8 | 6 | // Copyright (C) 2002-2016, International Business Machines Corporation and others. |
b75a7d8f A |
7 | // All Rights Reserved. |
8 | // | |
9 | // This file contains declarations for class RBBIRuleScanner | |
10 | // | |
11 | ||
12 | ||
13 | #ifndef RBBISCAN_H | |
14 | #define RBBISCAN_H | |
15 | ||
16 | #include "unicode/utypes.h" | |
17 | #include "unicode/uobject.h" | |
18 | #include "unicode/rbbi.h" | |
19 | #include "unicode/uniset.h" | |
20 | #include "unicode/parseerr.h" | |
21 | #include "uhash.h" | |
22 | #include "uvector.h" | |
374ca955 | 23 | #include "unicode/symtable.h"// For UnicodeSet parsing; this is the interface that
b75a7d8f A |
24 | // looks up references to $variables within a set. |
25 | #include "rbbinode.h" | |
2ca993e8 | 26 | #include "rbbirpt.h" |
b75a7d8f A |
27 | |
28 | U_NAMESPACE_BEGIN | |
29 | ||
30 | class RBBIRuleBuilder; | |
31 | class RBBISymbolTable; | |
32 | ||
33 | ||
34 | //-------------------------------------------------------------------------------- | |
35 | // | |
36 | // class RBBIRuleScanner does the lowest level, character-at-a-time | |
37 | // scanning of break iterator rules. | |
38 | // | |
39 | // The output of the scanner is parse trees for | |
40 | // the rule expressions and a list of all Unicode Sets | |
41 | // encountered. | |
42 | // | |
43 | //-------------------------------------------------------------------------------- | |
b75a7d8f A |
44 | |
45 | class RBBIRuleScanner : public UMemory { | |
46 | public: | |
47 | ||
46f4442e A |
48 | enum { |
49 | kStackSize = 100 // The size of the state stack for | |
50 | }; // rules parsing. Corresponds roughly | |
51 | // to the depth of parentheses nesting | |
52 | // that is allowed in the rules. | |
53 | ||
b75a7d8f A |
54 | struct RBBIRuleChar { |
55 | UChar32 fChar; | |
56 | UBool fEscaped; | |
3d1f044b | 57 | RBBIRuleChar() : fChar(0), fEscaped(FALSE) {} |
b75a7d8f A |
58 | }; |
59 | ||
60 | RBBIRuleScanner(RBBIRuleBuilder *rb); | |
61 | ||
62 | ||
63 | virtual ~RBBIRuleScanner(); | |
64 | ||
65 | void nextChar(RBBIRuleChar &c); // Get the next char from the input stream. | |
66 | // NOTE(review): declared void, so the old "Return false if at end." remark is stale; end of input is presumably reported through c - TODO confirm. | |
67 | ||
68 | UBool push(const RBBIRuleChar &c); // Push (unget) one character. | |
69 | // Only a single character may be pushed. | |
70 | ||
71 | void parse(); // Parse the rules, generating two parse | |
72 | // trees, one each for the forward and | |
73 | // reverse rules, | |
74 | // and a list of UnicodeSets encountered. | |
75 | ||
76 | /** | |
77 | * Return a rules string without unnecessary | |
78 | * characters. | |
79 | */ | |
80 | static UnicodeString stripRules(const UnicodeString &rules); | |
81 | private: | |
82 | ||
46f4442e | 83 | UBool doParseActions(int32_t a);
b75a7d8f A |
84 | void error(UErrorCode e); // error reporting convenience function. |
85 | void fixOpStack(RBBINode::OpPrecedence p); | |
86 | // NOTE(review): orphaned comment fragment ("a character.") - no declaration here matches it. | |
87 | void findSetFor(const UnicodeString &s, RBBINode *node, UnicodeSet *setToAdopt = NULL); | |
88 | ||
89 | UChar32 nextCharLL(); | |
73c04bcf | 90 | #ifdef RBBI_DEBUG
b75a7d8f | 91 | void printNodeStack(const char *title);
73c04bcf | 92 | #endif
b75a7d8f A |
93 | RBBINode *pushNewNode(RBBINode::NodeType t); |
94 | void scanSet(); | |
95 | ||
96 | ||
97 | RBBIRuleBuilder *fRB; // The rule builder that we are part of. | |
98 | ||
99 | int32_t fScanIndex; // Index of current character being processed | |
100 | // in the rule input string. | |
101 | int32_t fNextIndex; // Index of the next character, which | |
102 | // is the first character not yet scanned. | |
103 | UBool fQuoteMode; // Scan is in a 'quoted region' | |
73c04bcf A |
104 | int32_t fLineNum; // Line number in input file. |
105 | int32_t fCharNum; // Char position within the line. | |
b75a7d8f A |
106 | UChar32 fLastChar; // Previous char, needed to count CR-LF |
107 | // as a single line, not two. | |
108 | ||
109 | RBBIRuleChar fC; // Current char for parse state machine | |
110 | // processing. | |
111 | UnicodeString fVarName; // $variableName, valid when we've just | |
112 | // scanned one. | |
113 | ||
114 | RBBIRuleTableEl **fStateTable; // State Transition Table for RBBI Rule | |
115 | // parsing. Indexed by p[state][char-class] | |
116 | ||
117 | uint16_t fStack[kStackSize]; // State stack, holds state pushes | |
73c04bcf | 118 | int32_t fStackPtr; // and pops as specified in the state
b75a7d8f A |
119 | // transition rules. |
120 | ||
121 | RBBINode *fNodeStack[kStackSize]; // Node stack, holds nodes created | |
122 | // during the parse of a rule | |
73c04bcf | 123 | int32_t fNodeStackPtr;
b75a7d8f A |
124 | 
125 | ||
126 | UBool fReverseRule; // True if the rule currently being scanned | |
127 | // is a reverse direction rule (if it | |
128 | // starts with a '!') | |
129 | ||
130 | UBool fLookAheadRule; // True if the rule includes a '/' | |
131 | // somewhere within it. | |
132 | ||
2ca993e8 A |
133 | UBool fNoChainInRule; // True if the current rule starts with a '^'. |
134 | ||
b75a7d8f A |
135 | RBBISymbolTable *fSymbolTable; // symbol table, holds definitions of |
136 | // $variable symbols. | |
137 | ||
138 | UHashtable *fSetTable; // UnicodeSet hash table, holds indexes to | |
139 | // the sets created while parsing rules. | |
140 | // The key is the string used for creating | |
141 | // the set. | |
142 | ||
46f4442e | 143 | UnicodeSet fRuleSets[10]; // Unicode Sets that are needed during
b75a7d8f A |
144 | // the scanning of RBBI rules. The |
145 | // indices for these are assigned by the | |
146 | // perl script that builds the state tables. | |
147 | // See rbbirpt.h. | |
148 | ||
149 | int32_t fRuleNum; // Counts each rule as it is scanned. | |
150 | ||
374ca955 A |
151 | int32_t fOptionStart; // Input index of start of a !!option |
152 | // keyword, while being scanned. | |
153 | ||
b75a7d8f A |
154 | UnicodeSet *gRuleSet_rule_char; |
155 | UnicodeSet *gRuleSet_white_space; | |
156 | UnicodeSet *gRuleSet_name_char; | |
157 | UnicodeSet *gRuleSet_name_start_char; | |
158 | ||
159 | RBBIRuleScanner(const RBBIRuleScanner &other); // forbid copying of this class | |
160 | RBBIRuleScanner &operator=(const RBBIRuleScanner &other); // forbid copying of this class | |
161 | }; | |
162 | ||
163 | U_NAMESPACE_END | |
164 | ||
165 | #endif |