]>
Commit | Line | Data |
---|---|---|
b75a7d8f A |
1 | // |
2 | // regexcmp.h | |
3 | // | |
4 | // Copyright (C) 2002-2003, International Business Machines Corporation and others. | |
5 | // All Rights Reserved. | |
6 | // | |
7 | // This file contains declarations for the class RegexCompile | |
8 | // | |
9 | // This class is internal to the regular expression implementation. | |
10 | // For the public Regular Expression API, see the file "unicode/regex.h" | |
11 | // | |
12 | ||
13 | ||
14 | #ifndef REGEXCMP_H | |
15 | #define REGEXCMP_H | |
16 | ||
17 | #include "unicode/utypes.h" | |
18 | #if !UCONFIG_NO_REGULAR_EXPRESSIONS | |
19 | ||
20 | #include "unicode/uobject.h" | |
21 | #include "unicode/uniset.h" | |
22 | #include "unicode/parseerr.h" | |
23 | #include "uhash.h" | |
24 | #include "uvector.h" | |
25 | ||
26 | ||
27 | ||
28 | U_NAMESPACE_BEGIN | |
29 | ||
30 | ||
31 | //-------------------------------------------------------------------------------- | |
32 | // | |
33 | // class RegexCompile Contains the regular expression compiler. | |
34 | // | |
35 | //-------------------------------------------------------------------------------- | |
36 | static const int kStackSize = 100; // Capacity, in entries, of the state stack used | |
37 | // while parsing a pattern (it sizes RegexCompile::fStack). | |
38 | // Corresponds roughly to the depth of parentheses | |
39 | // nesting that is allowed in a pattern. | |
40 | ||
41 | enum EParseAction {dummy01, dummy02}; // Placeholder enum for the specifier of the | |
42 | // actions that are named in the parsing | |
43 | // state table; only dummy enumerators are listed here. | |
// Forward declarations: this header only uses pointers to these two types.
44 | struct RegexTableEl; | |
45 | class RegexPattern; | |
46 | ||
47 | ||
// RegexCompile is the regular expression compiler. It is internal to the
// regular expression implementation and is not part of the public API.
48 | class RegexCompile : public UMemory { | |
49 | public: | |
50 | ||
// A pattern character as handed out by nextChar(): a code point plus a flag.
// NOTE(review): fQuoted presumably marks a char that was escaped or quoted
// in the pattern - confirm against the implementation.
51 | struct RegexPatternChar { | |
52 | UChar32 fChar; | |
53 | UBool fQuoted; | |
54 | }; | |
55 | ||
// NOTE(review): rp is presumably the RegexPattern that receives the compiled
// output (see fRXPat), and e the caller's status - confirm.
56 | RegexCompile(RegexPattern *rp, UErrorCode &e); | |
57 | ||
// Compile the pattern string pat.
// NOTE(review): pp and e presumably receive parse-error details and the
// resulting status - confirm against the implementation.
58 | void compile(const UnicodeString &pat, UParseError &pp, UErrorCode &e); | |
59 | ||
60 | ||
61 | virtual ~RegexCompile(); | |
62 | ||
63 | void nextChar(RegexPatternChar &c); // Get the next char from the input stream. | |
64 | ||
65 | static void cleanup(); // Memory cleanup | |
66 | ||
67 | ||
68 | ||
69 | // Categories of parentheses in pattern. | |
70 | // The category is saved in the compile-time parentheses stack frame (it is the | |
71 | // negative entry described at fParenStack), and determines the code to be generated when the matching close ) is encountered. | |
72 | enum EParenClass { | |
73 | plain = -1, // No special handling | |
74 | capturing = -2, | |
75 | atomic = -3, | |
76 | lookAhead = -4, | |
77 | negLookAhead = -5, | |
78 | flags = -6, | |
79 | lookBehind = -7, | |
80 | lookBehindN = -8 | |
81 | }; | |
82 | ||
83 | private: | |
84 | ||
85 | ||
86 | UBool doParseActions(EParseAction a); | |
87 | void error(UErrorCode e); // Error reporting convenience function. | |
88 | ||
89 | UChar32 nextCharLL(); | |
90 | UChar32 peekCharLL(); | |
91 | UnicodeSet *scanSet(); | |
92 | UnicodeSet *scanProp(); | |
93 | void handleCloseParen(); | |
94 | int32_t blockTopLoc(UBool reserve); // Locate a position in the compiled pattern | |
95 | // at the top of the just completed block | |
96 | // or operation, and optionally ensure that | |
97 | // there is space to add an opcode there. | |
98 | void compileSet(UnicodeSet *theSet); // Generate the compiled pattern for | |
99 | // a reference to a UnicodeSet. | |
100 | void compileInterval(int32_t InitOp, // Generate the code for a {min,max} quantifier. | |
101 | int32_t LoopOp); | |
102 | UBool compileInlineInterval(); // Generate inline code for a {min,max} quantifier | |
103 | void literalChar(UChar32 c); // Compile a literal char | |
104 | void fixLiterals(UBool split=FALSE); // Fix literal strings. | |
105 | void insertOp(int32_t where); // Open up a slot for a new op in the | |
106 | // generated code at the specified location. | |
107 | void emitONE_CHAR(UChar32 c); // Emit a ONE_CHAR op into the compiled code, | |
108 | // taking case mode into account. | |
109 | int32_t minMatchLength(int32_t start, | |
110 | int32_t end); | |
111 | int32_t maxMatchLength(int32_t start, | |
112 | int32_t end); | |
113 | void matchStartType(); | |
114 | void stripNOPs(); | |
115 | void OptDotStar(); | |
116 | ||
117 | ||
118 | UErrorCode *fStatus; | |
119 | RegexPattern *fRXPat; | |
120 | UParseError *fParseErr; | |
121 | ||
122 | // | |
123 | // Data associated with low level character scanning | |
124 | // | |
125 | int32_t fScanIndex; // Index of current character being processed | |
126 | // in the input pattern string. | |
127 | int32_t fNextIndex; // Index of the next character, which | |
128 | // is the first character not yet scanned. | |
129 | UBool fQuoteMode; // Scan is in a \Q...\E quoted region | |
130 | UBool fInBackslashQuote; // Scan is between a '\' and the following char. | |
131 | UBool fEOLComments; // When scan is just after '(?', inhibit #... to | |
132 | // end of line comments, in favor of (?#...) comments. | |
133 | int fLineNum; // Line number in input file. | |
134 | int fCharNum; // Char position within the line. | |
135 | UChar32 fLastChar; // Previous char, needed to count CR-LF | |
136 | // as a single line, not two. | |
137 | UChar32 fPeekChar; // Saved char, if we've scanned ahead. | |
138 | ||
139 | ||
140 | RegexPatternChar fC; // Current char for parse state machine | |
141 | // processing. | |
142 | ||
143 | // | |
144 | // Data for the state machine that parses the regular expression. | |
145 | // | |
146 | RegexTableEl **fStateTable; // State Transition Table for regex Rule | |
147 | // parsing. Indexed by p[state][char-class] | |
148 | ||
149 | uint16_t fStack[kStackSize]; // State stack, holds state pushes | |
150 | int fStackPtr; // and pops as specified in the state | |
151 | // transition rules. | |
152 | ||
153 | // | |
154 | // Data associated with the generation of the pcode for the match engine | |
155 | // | |
156 | int32_t fModeFlags; // Match Flags. (Case Insensitive, etc.) | |
157 | int32_t fNewModeFlags; // New flags, while compiling (?i, holds state | |
158 | // until last flag is scanned. | |
159 | UBool fSetModeFlag; // true for (?ismx, false for (?-ismx | |
160 | ||
161 | ||
162 | int32_t fStringOpStart; // While a literal string is being scanned | |
163 | // holds the start index within RegexPattern. | |
164 | // fLiteralText where the string is being stored. | |
165 | ||
166 | int32_t fPatternLength; // Length of the input pattern string. | |
167 | ||
168 | UVector32 fParenStack; // Parentheses stack. Each frame consists of | |
169 | // the positions of compiled pattern operations | |
170 | // needing fixup, followed by a negative value. The | |
171 | // first entry in each frame is the position of the | |
172 | // spot reserved for use when a quantifier | |
173 | // needs to add a SAVE at the start of a (block). | |
174 | // The negative value (-1, -2,...) indicates | |
175 | // the kind of paren that opened the frame (an EParenClass value). Some | |
176 | // need special handling on close. | |
177 | ||
178 | ||
179 | int32_t fMatchOpenParen; // The position in the compiled pattern | |
180 | // of the slot reserved for a state save | |
181 | // at the start of the most recently processed | |
182 | // parenthesized block. | |
183 | int32_t fMatchCloseParen; // The position in the pattern of the first | |
184 | // location after the most recently processed | |
185 | // parenthesized block. | |
186 | ||
187 | int32_t fIntervalLow; // {lower, upper} interval quantifier values. | |
188 | int32_t fIntervalUpper; // Placed here temporarily, when pattern is | |
189 | // initially scanned. Each new interval | |
190 | // encountered overwrites these values. | |
191 | // -1 for the upper interval value means none | |
192 | // was specified (unlimited occurrences.) | |
193 | ||
194 | int32_t fNameStartPos; // Starting position of a \N{NAME} name in a | |
195 | // pattern, valid while remainder of name is | |
196 | // scanned. | |
197 | }; | |
198 | ||
199 | U_NAMESPACE_END | |
200 | #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS | |
201 | #endif // RBBISCAN_H |