]>
Commit | Line | Data |
---|---|---|
b75a7d8f A |
1 | // |
2 | // regexcmp.h | |
3 | // | |
4388f060 | 4 | // Copyright (C) 2002-2012, International Business Machines Corporation and others. |
b75a7d8f A |
5 | // All Rights Reserved. |
6 | // | |
7 | // This file contains declarations for the class RegexCompile | |
8 | // | |
9 | // This class is internal to the regular expression implementation. | |
10 | // For the public Regular Expression API, see the file "unicode/regex.h" | |
11 | // | |
12 | ||
13 | ||
// NOTE(review): the guard macro below does not match this header's name (regexcmp.h, per the banner above).
// Presumably copied from another header; if any other header uses the same macro, whichever is included second is silently skipped. Confirm, and consider REGEXCMP_H.
14 | #ifndef RBBISCAN_H | |
15 | #define RBBISCAN_H | |
16 | ||
17 | #include "unicode/utypes.h" | |
18 | #if !UCONFIG_NO_REGULAR_EXPRESSIONS | |
19 | ||
20 | #include "unicode/uobject.h" | |
21 | #include "unicode/uniset.h" | |
22 | #include "unicode/parseerr.h" | |
23 | #include "uhash.h" | |
24 | #include "uvector.h" | |
25 | ||
26 | ||
27 | ||
28 | U_NAMESPACE_BEGIN | |
29 | ||
30 | ||
31 | //-------------------------------------------------------------------------------- | |
32 | // | |
33 | // class RegexCompile Contains the regular expression compiler. | |
34 | // | |
35 | //-------------------------------------------------------------------------------- | |
b75a7d8f A |
36 | struct RegexTableEl; |
37 | class RegexPattern; | |
38 | ||
39 | ||
40 | class RegexCompile : public UMemory { | |
41 | public: | |
42 | ||
46f4442e A |
43 | enum { |
44 | kStackSize = 100 // The size of the state stack for | |
45 | }; // pattern parsing. Corresponds roughly | |
46 | // to the depth of parentheses nesting | |
47 | // that is allowed in the pattern. | |
48 | ||
b75a7d8f A |
49 | struct RegexPatternChar { |
50 | UChar32 fChar; | |
51 | UBool fQuoted; | |
52 | }; | |
53 | ||
54 | RegexCompile(RegexPattern *rp, UErrorCode &e); | |
46f4442e | 55 | |
b75a7d8f | 56 | void compile(const UnicodeString &pat, UParseError &pp, UErrorCode &e); |
729e4ab9 A |
57 | void compile(UText *pat, UParseError &pp, UErrorCode &e); |
58 | ||
b75a7d8f A |
59 | |
60 | virtual ~RegexCompile(); | |
61 | ||
62 | void nextChar(RegexPatternChar &c); // Get the next char from the input stream. | |
63 | ||
64 | static void cleanup(); // Memory cleanup | |
65 | ||
66 | ||
67 | ||
68 | // Categories of parentheses in pattern. | |
69 | // The category is saved in the compile-time parentheses stack frame, and | |
70 | // determines the code to be generated when the matching close ) is encountered. | |
71 | enum EParenClass { | |
72 | plain = -1, // No special handling | |
46f4442e | 73 | capturing = -2,
b75a7d8f A |
74 | atomic = -3, |
75 | lookAhead = -4, | |
76 | negLookAhead = -5, | |
77 | flags = -6, | |
78 | lookBehind = -7, | |
79 | lookBehindN = -8 | |
80 | }; | |
81 | ||
82 | private: | |
83 | ||
84 | ||
46f4442e | 85 | UBool doParseActions(int32_t a);
b75a7d8f A |
86 | void error(UErrorCode e); // error reporting convenience function. |
87 | ||
88 | UChar32 nextCharLL(); | |
89 | UChar32 peekCharLL(); | |
b75a7d8f | 90 | UnicodeSet *scanProp();
46f4442e | 91 | UnicodeSet *scanPosixProp();
b75a7d8f A |
92 | void handleCloseParen(); |
93 | int32_t blockTopLoc(UBool reserve); // Locate a position in the compiled pattern | |
94 | // at the top of the just completed block | |
95 | // or operation, and optionally ensure that | |
96 | // there is space to add an opcode there. | |
97 | void compileSet(UnicodeSet *theSet); // Generate the compiled pattern for | |
98 | // a reference to a UnicodeSet. | |
99 | void compileInterval(int32_t InitOp, // Generate the code for a {min,max} quantifier. | |
100 | int32_t LoopOp); | |
101 | UBool compileInlineInterval(); // Generate inline code for a {min,max} quantifier | |
102 | void literalChar(UChar32 c); // Compile a literal char | |
4388f060 | 103 | void fixLiterals(UBool split=FALSE); // Generate code for pending literal characters.
b75a7d8f A |
104 | void insertOp(int32_t where); // Open up a slot for a new op in the |
105 | // generated code at the specified location. | |
b75a7d8f A |
106 | int32_t minMatchLength(int32_t start, |
107 | int32_t end); | |
108 | int32_t maxMatchLength(int32_t start, | |
109 | int32_t end); | |
110 | void matchStartType(); | |
111 | void stripNOPs(); | |
46f4442e A |
112 | |
113 | void setEval(int32_t op); | |
114 | void setPushOp(int32_t op); | |
115 | UChar32 scanNamedChar(); | |
116 | UnicodeSet *createSetForProperty(const UnicodeString &propName, UBool negated); | |
b75a7d8f A |
117 | |
118 | ||
119 | UErrorCode *fStatus; | |
120 | RegexPattern *fRXPat; | |
121 | UParseError *fParseErr; | |
122 | ||
123 | // | |
124 | // Data associated with low level character scanning | |
125 | // | |
729e4ab9 | 126 | int64_t fScanIndex; // Index of current character being processed
b75a7d8f | 127 | // in the input pattern string.
b75a7d8f A |
128 | UBool fQuoteMode; // Scan is in a \Q...\E quoted region |
129 | UBool fInBackslashQuote; // Scan is between a '\' and the following char. | |
46f4442e | 130 | UBool fEOLComments; // When scan is just after '(?', inhibit #... to
b75a7d8f | 131 | // end of line comments, in favor of (?#...) comments.
729e4ab9 A |
132 | int64_t fLineNum; // Line number in input file. |
133 | int64_t fCharNum; // Char position within the line. | |
b75a7d8f A |
134 | UChar32 fLastChar; // Previous char, needed to count CR-LF |
135 | // as a single line, not two. | |
136 | UChar32 fPeekChar; // Saved char, if we've scanned ahead. | |
137 | ||
138 | ||
139 | RegexPatternChar fC; // Current char for parse state machine | |
140 | // processing. | |
141 | ||
142 | // | |
143 | // Data for the state machine that parses the regular expression. | |
144 | // | |
145 | RegexTableEl **fStateTable; // State Transition Table for regex Rule | |
146 | // parsing. index by p[state][char-class] | |
147 | ||
148 | uint16_t fStack[kStackSize]; // State stack, holds state pushes | |
73c04bcf | 149 | int32_t fStackPtr; // and pops as specified in the state
b75a7d8f A |
150 | // transition rules. |
151 | ||
152 | // | |
153 | // Data associated with the generation of the pcode for the match engine | |
154 | // | |
155 | int32_t fModeFlags; // Match Flags. (Case Insensitive, etc.) | |
73c04bcf A |
156 | // Always has high bit (31) set so that flag values |
157 | // on the paren stack are distinguished from relocatable | |
158 | // pcode addresses. | |
b75a7d8f A |
159 | int32_t fNewModeFlags; // New flags, while compiling (?i, holds state |
160 | // until last flag is scanned. | |
161 | UBool fSetModeFlag; // true for (?ismx, false for (?-ismx | |
162 | ||
4388f060 A |
163 | UnicodeString fLiteralChars; // Literal chars or strings from the pattern are accumulated here. |
164 | // Once completed, meaning that some non-literal pattern | |
165 | // construct is encountered, the appropriate opcodes | |
166 | // to match the literal will be generated, and this | |
167 | // string will be cleared. | |
b75a7d8f | 168 | |
729e4ab9 A |
169 | int64_t fPatternLength; // Length of the input pattern string. |
170 | ||
b75a7d8f A |
171 | UVector32 fParenStack; // parentheses stack. Each frame consists of |
172 | // the positions of compiled pattern operations | |
46f4442e | 173 | // needing fixup, followed by a negative value. The
b75a7d8f A |
174 | // first entry in each frame is the position of the |
175 | // spot reserved for use when a quantifier | |
176 | // needs to add a SAVE at the start of a (block) | |
177 | // The negative value (-1, -2,...) indicates | |
178 | // the kind of paren that opened the frame. Some | |
179 | // need special handling on close. | |
180 | ||
181 | ||
182 | int32_t fMatchOpenParen; // The position in the compiled pattern | |
183 | // of the slot reserved for a state save | |
184 | // at the start of the most recently processed | |
185 | // parenthesized block. | |
186 | int32_t fMatchCloseParen; // The position in the pattern of the first | |
187 | // location after the most recently processed | |
188 | // parenthesized block. | |
189 | ||
190 | int32_t fIntervalLow; // {lower, upper} interval quantifier values. | |
191 | int32_t fIntervalUpper; // Placed here temporarily, when pattern is | |
192 | // initially scanned. Each new interval | |
193 | // encountered overwrites these values. | |
194 | // -1 for the upper interval value means none | |
195 | // was specified (unlimited occurrences.) | |
196 | ||
729e4ab9 | 197 | int64_t fNameStartPos; // Starting position of a \N{NAME} name in a
b75a7d8f A |
198 | // pattern, valid while remainder of name is |
199 | // scanned. | |
46f4442e A |
200 | |
201 | UStack fSetStack; // Stack of UnicodeSets, used while evaluating | |
202 | // (at compile time) set expressions within | |
203 | // the pattern. | |
204 | UStack fSetOpStack; // Stack of pending set operators (&&, --, union) | |
205 | ||
206 | UChar32 fLastSetLiteral; // The last single code point added to a set. | |
207 | // needed when "-y" is scanned, and we need | |
208 | // to turn "x-y" into a range. | |
b75a7d8f A |
209 | }; |
210 | ||
46f4442e A |
211 | // Constant values to be pushed onto fSetOpStack while scanning & evaluating [set expressions] |
212 | // The high 16 bits are the operator precedence, and the low 16 are a code for the operation itself. | |
213 | ||
214 | enum SetOperations { | |
215 | setStart = 0 << 16 | 1, | |
216 | setEnd = 1 << 16 | 2, | |
217 | setNegation = 2 << 16 | 3, | |
218 | setCaseClose = 2 << 16 | 9, | |
219 | setDifference2 = 3 << 16 | 4, // '--' set difference operator | |
220 | setIntersection2 = 3 << 16 | 5, // '&&' set intersection operator | |
221 | setUnion = 4 << 16 | 6, // implicit union of adjacent items | |
222 | setDifference1 = 4 << 16 | 7, // '-', single dash difference op, for compatibility with old UnicodeSet. | |
223 | setIntersection1 = 4 << 16 | 8 // '&', single amp intersection op, for compatibility with old UnicodeSet. | |
224 | }; | |
225 | ||
b75a7d8f A |
226 | U_NAMESPACE_END |
227 | #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS | |
228 | #endif // RBBISCAN_H |