]>
Commit | Line | Data |
---|---|---|
1 | // © 2016 and later: Unicode, Inc. and others. | |
2 | // License & terms of use: http://www.unicode.org/copyright.html | |
3 | // | |
4 | // rbbiscan.h | |
5 | // | |
6 | // Copyright (C) 2002-2016, International Business Machines Corporation and others. | |
7 | // All Rights Reserved. | |
8 | // | |
9 | // This file contains declarations for class RBBIRuleScanner | |
10 | // | |
11 | ||
12 | ||
13 | #ifndef RBBISCAN_H | |
14 | #define RBBISCAN_H | |
15 | ||
16 | #include "unicode/utypes.h" | |
17 | #include "unicode/uobject.h" | |
18 | #include "unicode/rbbi.h" | |
19 | #include "unicode/uniset.h" | |
20 | #include "unicode/parseerr.h" | |
21 | #include "uhash.h" | |
22 | #include "uvector.h" | |
23 | #include "unicode/symtable.h"// For UnicodeSet parsing: the interface that | |
24 | // looks up references to $variables within a set. | |
25 | #include "rbbinode.h" | |
26 | #include "rbbirpt.h" | |
27 | ||
28 | U_NAMESPACE_BEGIN | |
29 | ||
30 | class RBBIRuleBuilder; | |
31 | class RBBISymbolTable; | |
32 | ||
33 | ||
34 | //-------------------------------------------------------------------------------- | |
35 | // | |
36 | // class RBBIRuleScanner does the lowest level, character-at-a-time | |
37 | // scanning of break iterator rules. | |
38 | // | |
39 | // The output of the scanner is parse trees for | |
40 | // the rule expressions and a list of all Unicode Sets | |
41 | // encountered. | |
42 | // | |
43 | //-------------------------------------------------------------------------------- | |
44 | ||
45 | class RBBIRuleScanner : public UMemory { | |
46 | public: | |
47 | ||
48 | enum { | |
49 | kStackSize = 100 // Capacity of both fStack and fNodeStack | |
50 | }; // used for rules parsing. Corresponds roughly | |
51 | // to the depth of parentheses nesting | |
52 | // that is allowed in the rules. | |
53 | ||
54 | struct RBBIRuleChar { | |
55 | UChar32 fChar; | |
56 | UBool fEscaped; | |
57 | RBBIRuleChar() : fChar(0), fEscaped(FALSE) {} | |
58 | }; | |
59 | ||
60 | RBBIRuleScanner(RBBIRuleBuilder *rb); | |
61 | ||
62 | ||
63 | virtual ~RBBIRuleScanner(); | |
64 | ||
65 | void nextChar(RBBIRuleChar &c); // Get the next char from the input stream. | |
66 | // Declared void, so nothing is returned directly; the char is presumably delivered through c. | |
67 | ||
68 | UBool push(const RBBIRuleChar &c); // Push (unget) one character. | |
69 | // Only a single character may be pushed. | |
70 | ||
71 | void parse(); // Parse the rules, generating two parse | |
72 | // trees, one each for the forward and | |
73 | // reverse rules, | |
74 | // and a list of UnicodeSets encountered. | |
75 | ||
76 | /** | |
77 | * Return a rules string without unnecessary | |
78 | * characters. | |
79 | */ | |
80 | static UnicodeString stripRules(const UnicodeString &rules); | |
81 | private: | |
82 | ||
83 | UBool doParseActions(int32_t a); | |
84 | void error(UErrorCode e); // error reporting convenience function. | |
85 | void fixOpStack(RBBINode::OpPrecedence p); | |
86 | // (Next declaration) findSetFor: setToAdopt is optional and defaults to NULL. | |
87 | void findSetFor(const UnicodeString &s, RBBINode *node, UnicodeSet *setToAdopt = NULL); | |
88 | ||
89 | UChar32 nextCharLL(); | |
90 | #ifdef RBBI_DEBUG | |
91 | void printNodeStack(const char *title); | |
92 | #endif | |
93 | RBBINode *pushNewNode(RBBINode::NodeType t); | |
94 | void scanSet(); | |
95 | ||
96 | ||
97 | RBBIRuleBuilder *fRB; // The rule builder that we are part of. | |
98 | ||
99 | int32_t fScanIndex; // Index of current character being processed | |
100 | // in the rule input string. | |
101 | int32_t fNextIndex; // Index of the next character, which | |
102 | // is the first character not yet scanned. | |
103 | UBool fQuoteMode; // Scan is in a 'quoted region' | |
104 | int32_t fLineNum; // Line number in input file. | |
105 | int32_t fCharNum; // Char position within the line. | |
106 | UChar32 fLastChar; // Previous char, needed to count CR-LF | |
107 | // as a single line, not two. | |
108 | ||
109 | RBBIRuleChar fC; // Current char for parse state machine | |
110 | // processing. | |
111 | UnicodeString fVarName; // $variableName, valid when we've just | |
112 | // scanned one. | |
113 | ||
114 | RBBIRuleTableEl **fStateTable; // State Transition Table for RBBI Rule | |
115 | // parsing. Indexed as p[state][char-class] | |
116 | ||
117 | uint16_t fStack[kStackSize]; // State stack, holds state pushes | |
118 | int32_t fStackPtr; // and pops as specified in the state | |
119 | // transition rules. | |
120 | ||
121 | RBBINode *fNodeStack[kStackSize]; // Node stack, holds nodes created | |
122 | // during the parse of a rule | |
123 | int32_t fNodeStackPtr; | |
124 | ||
125 | ||
126 | UBool fReverseRule; // True if the rule currently being scanned | |
127 | // is a reverse direction rule (if it | |
128 | // starts with a '!') | |
129 | ||
130 | UBool fLookAheadRule; // True if the rule includes a '/' | |
131 | // somewhere within it. | |
132 | ||
133 | UBool fNoChainInRule; // True if the current rule starts with a '^'. | |
134 | ||
135 | RBBISymbolTable *fSymbolTable; // symbol table, holds definitions of | |
136 | // $variable symbols. | |
137 | ||
138 | UHashtable *fSetTable; // UnicodeSet hash table, holds indexes to | |
139 | // the sets created while parsing rules. | |
140 | // The key is the string used for creating | |
141 | // the set. | |
142 | ||
143 | UnicodeSet fRuleSets[10]; // Unicode Sets that are needed during | |
144 | // the scanning of RBBI rules. The | |
145 | // indices for these are assigned by the | |
146 | // perl script that builds the state tables. | |
147 | // See rbbirpt.h. | |
148 | ||
149 | int32_t fRuleNum; // Counts each rule as it is scanned. | |
150 | ||
151 | int32_t fOptionStart; // Input index of start of a !!option | |
152 | // keyword, while being scanned. | |
153 | ||
154 | UnicodeSet *gRuleSet_rule_char; | |
155 | UnicodeSet *gRuleSet_white_space; | |
156 | UnicodeSet *gRuleSet_name_char; | |
157 | UnicodeSet *gRuleSet_name_start_char; | |
158 | ||
159 | RBBIRuleScanner(const RBBIRuleScanner &other); // forbid copying of this class | |
160 | RBBIRuleScanner &operator=(const RBBIRuleScanner &other); // forbid copying of this class | |
161 | }; | |
162 | ||
163 | U_NAMESPACE_END | |
164 | ||
165 | #endif |