]>
Commit | Line | Data |
---|---|---|
b75a7d8f A |
1 | // |
2 | // rbbiscan.h | |
3 | // | |
46f4442e | 4 | // Copyright (C) 2002-2008, International Business Machines Corporation and others. |
b75a7d8f A |
5 | // All Rights Reserved. |
6 | // | |
7 | // This file contains declarations for class RBBIRuleScanner | |
8 | // | |
9 | ||
10 | ||
11 | #ifndef RBBISCAN_H | |
12 | #define RBBISCAN_H | |
13 | ||
14 | #include "unicode/utypes.h" | |
15 | #include "unicode/uobject.h" | |
16 | #include "unicode/rbbi.h" | |
17 | #include "unicode/uniset.h" | |
18 | #include "unicode/parseerr.h" | |
19 | #include "uhash.h" | |
20 | #include "uvector.h" | |
374ca955 | 21 | #include "unicode/symtable.h"// For UnicodeSet parsing; this is the interface that
b75a7d8f A |
22 | // looks up references to $variables within a set. |
23 | #include "rbbinode.h" | |
24 | //#include "rbbitblb.h" | |
25 | ||
26 | ||
27 | ||
28 | U_NAMESPACE_BEGIN | |
29 | ||
30 | class RBBIRuleBuilder; | |
31 | class RBBISymbolTable; | |
32 | ||
33 | ||
34 | //-------------------------------------------------------------------------------- | |
35 | // | |
36 | // class RBBIRuleScanner does the lowest level, character-at-a-time | |
37 | // scanning of break iterator rules. | |
38 | // | |
39 | // The output of the scanner is parse trees for | |
40 | // the rule expressions and a list of all Unicode Sets | |
41 | // encountered. | |
42 | // | |
43 | //-------------------------------------------------------------------------------- | |
b75a7d8f A |
44 | |
45 | class RBBIRuleScanner : public UMemory { | |
46 | public: | |
47 | ||
46f4442e A |
48 | enum { |
49 | kStackSize = 100 // The size of the state stack for | |
50 | }; // rules parsing. Corresponds roughly | |
51 | // to the depth of parentheses nesting | |
52 | // that is allowed in the rules. | |
53 | ||
b75a7d8f A |
54 | struct RBBIRuleChar { |
55 | UChar32 fChar; | |
56 | UBool fEscaped; | |
57 | }; | |
58 | ||
59 | RBBIRuleScanner(RBBIRuleBuilder *rb); | |
60 | ||
61 | ||
62 | virtual ~RBBIRuleScanner(); | |
63 | ||
64 | void nextChar(RBBIRuleChar &c); // Get the next char from the input stream. | |
65 | // Declared void: end of input is not signaled by a return value. | |
66 | ||
67 | UBool push(const RBBIRuleChar &c); // Push (unget) one character. | |
68 | // Only a single character may be pushed. | |
69 | ||
70 | void parse(); // Parse the rules, generating two parse | |
71 | // trees, one each for the forward and | |
72 | // reverse rules, | |
73 | // and a list of UnicodeSets encountered. | |
74 | ||
75 | /** | |
76 | * Return a rules string without unnecessary | |
77 | * characters. | |
78 | */ | |
79 | static UnicodeString stripRules(const UnicodeString &rules); | |
80 | private: | |
81 | ||
46f4442e | 82 | UBool doParseActions(int32_t a); |
b75a7d8f A |
83 | void error(UErrorCode e); // error reporting convenience function. |
84 | void fixOpStack(RBBINode::OpPrecedence p); | |
85 | // a character. | |
86 | void findSetFor(const UnicodeString &s, RBBINode *node, UnicodeSet *setToAdopt = NULL); | |
87 | ||
88 | UChar32 nextCharLL(); | |
73c04bcf | 89 | #ifdef RBBI_DEBUG |
b75a7d8f | 90 | void printNodeStack(const char *title); |
73c04bcf | 91 | #endif |
b75a7d8f A |
92 | RBBINode *pushNewNode(RBBINode::NodeType t); |
93 | void scanSet(); | |
94 | ||
95 | ||
96 | RBBIRuleBuilder *fRB; // The rule builder that we are part of. | |
97 | ||
98 | int32_t fScanIndex; // Index of current character being processed | |
99 | // in the rule input string. | |
100 | int32_t fNextIndex; // Index of the next character, which | |
101 | // is the first character not yet scanned. | |
102 | UBool fQuoteMode; // Scan is in a 'quoted region' | |
73c04bcf A |
103 | int32_t fLineNum; // Line number in input file. |
104 | int32_t fCharNum; // Char position within the line. | |
b75a7d8f A |
105 | UChar32 fLastChar; // Previous char, needed to count CR-LF |
106 | // as a single line, not two. | |
107 | ||
108 | RBBIRuleChar fC; // Current char for parse state machine | |
109 | // processing. | |
110 | UnicodeString fVarName; // $variableName, valid when we've just | |
111 | // scanned one. | |
112 | ||
113 | RBBIRuleTableEl **fStateTable; // State Transition Table for RBBI Rule | |
114 | // parsing. Indexed by p[state][char-class] | |
115 | ||
116 | uint16_t fStack[kStackSize]; // State stack, holds state pushes | |
73c04bcf | 117 | int32_t fStackPtr; // and pops as specified in the state |
b75a7d8f A |
118 | // transition rules. |
119 | ||
120 | RBBINode *fNodeStack[kStackSize]; // Node stack, holds nodes created | |
121 | // during the parse of a rule | |
73c04bcf | 122 | int32_t fNodeStackPtr; |
b75a7d8f A |
123 | |
124 | ||
125 | UBool fReverseRule; // True if the rule currently being scanned | |
126 | // is a reverse direction rule (if it | |
127 | // starts with a '!') | |
128 | ||
129 | UBool fLookAheadRule; // True if the rule includes a '/' | |
130 | // somewhere within it. | |
131 | ||
132 | RBBISymbolTable *fSymbolTable; // symbol table, holds definitions of | |
133 | // $variable symbols. | |
134 | ||
135 | UHashtable *fSetTable; // UnicodeSet hash table, holds indexes to | |
136 | // the sets created while parsing rules. | |
137 | // The key is the string used for creating | |
138 | // the set. | |
139 | ||
46f4442e | 140 | UnicodeSet fRuleSets[10]; // Unicode Sets that are needed during |
b75a7d8f A |
141 | // the scanning of RBBI rules. The |
142 | // indices for these are assigned by the | |
143 | // perl script that builds the state tables. | |
144 | // See rbbirpt.h. | |
145 | ||
146 | int32_t fRuleNum; // Counts each rule as it is scanned. | |
147 | ||
374ca955 A |
148 | int32_t fOptionStart; // Input index of start of a !!option |
149 | // keyword, while being scanned. | |
150 | ||
b75a7d8f A |
151 | UnicodeSet *gRuleSet_rule_char; |
152 | UnicodeSet *gRuleSet_white_space; | |
153 | UnicodeSet *gRuleSet_name_char; | |
154 | UnicodeSet *gRuleSet_name_start_char; | |
155 | ||
156 | RBBIRuleScanner(const RBBIRuleScanner &other); // forbid copying of this class | |
157 | RBBIRuleScanner &operator=(const RBBIRuleScanner &other); // forbid copying of this class | |
158 | }; | |
159 | ||
160 | U_NAMESPACE_END | |
161 | ||
162 | #endif |