]>
Commit | Line | Data |
---|---|---|
ba379fdc A |
1 | /* |
2 | * Copyright (C) 2009 Apple Inc. All rights reserved. | |
3 | * | |
4 | * Redistribution and use in source and binary forms, with or without | |
5 | * modification, are permitted provided that the following conditions | |
6 | * are met: | |
7 | * 1. Redistributions of source code must retain the above copyright | |
8 | * notice, this list of conditions and the following disclaimer. | |
9 | * 2. Redistributions in binary form must reproduce the above copyright | |
10 | * notice, this list of conditions and the following disclaimer in the | |
11 | * documentation and/or other materials provided with the distribution. | |
12 | * | |
13 | * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY | |
14 | * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |
15 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR | |
16 | * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR | |
17 | * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, | |
18 | * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, | |
19 | * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR | |
20 | * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY | |
21 | * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT | |
22 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE | |
23 | * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |
24 | */ | |
25 | ||
26 | #ifndef RegexInterpreter_h | |
27 | #define RegexInterpreter_h | |
28 | ||
29 | #include <wtf/Platform.h> | |
30 | ||
31 | #if ENABLE(YARR) | |
32 | ||
33 | #include <wtf/unicode/Unicode.h> | |
34 | #include "RegexParser.h" | |
35 | #include "RegexPattern.h" | |
36 | ||
37 | namespace JSC { namespace Yarr { | |
38 | ||
39 | class ByteDisjunction; | |
40 | ||
41 | struct ByteTerm { | |
42 | enum Type { | |
43 | TypeBodyAlternativeBegin, | |
44 | TypeBodyAlternativeDisjunction, | |
45 | TypeBodyAlternativeEnd, | |
46 | TypeAlternativeBegin, | |
47 | TypeAlternativeDisjunction, | |
48 | TypeAlternativeEnd, | |
49 | TypeSubpatternBegin, | |
50 | TypeSubpatternEnd, | |
51 | TypeAssertionBOL, | |
52 | TypeAssertionEOL, | |
53 | TypeAssertionWordBoundary, | |
54 | TypePatternCharacterOnce, | |
55 | TypePatternCharacterFixed, | |
56 | TypePatternCharacterGreedy, | |
57 | TypePatternCharacterNonGreedy, | |
58 | TypePatternCasedCharacterOnce, | |
59 | TypePatternCasedCharacterFixed, | |
60 | TypePatternCasedCharacterGreedy, | |
61 | TypePatternCasedCharacterNonGreedy, | |
62 | TypeCharacterClass, | |
63 | TypeBackReference, | |
64 | TypeParenthesesSubpattern, | |
65 | TypeParenthesesSubpatternOnceBegin, | |
66 | TypeParenthesesSubpatternOnceEnd, | |
67 | TypeParentheticalAssertionBegin, | |
68 | TypeParentheticalAssertionEnd, | |
69 | TypeCheckInput, | |
70 | } type; | |
71 | bool invertOrCapture; | |
72 | union { | |
73 | struct { | |
74 | union { | |
75 | UChar patternCharacter; | |
76 | struct { | |
77 | UChar lo; | |
78 | UChar hi; | |
79 | } casedCharacter; | |
80 | CharacterClass* characterClass; | |
81 | unsigned subpatternId; | |
82 | }; | |
83 | union { | |
84 | ByteDisjunction* parenthesesDisjunction; | |
85 | unsigned parenthesesWidth; | |
86 | }; | |
87 | QuantifierType quantityType; | |
88 | unsigned quantityCount; | |
89 | } atom; | |
90 | struct { | |
91 | int next; | |
92 | int end; | |
93 | } alternative; | |
94 | unsigned checkInputCount; | |
95 | }; | |
96 | unsigned frameLocation; | |
97 | int inputPosition; | |
98 | ||
99 | ByteTerm(UChar ch, int inputPos, unsigned frameLocation, unsigned quantityCount, QuantifierType quantityType) | |
100 | : frameLocation(frameLocation) | |
101 | { | |
102 | switch (quantityType) { | |
103 | case QuantifierFixedCount: | |
104 | type = (quantityCount == 1) ? ByteTerm::TypePatternCharacterOnce : ByteTerm::TypePatternCharacterFixed; | |
105 | break; | |
106 | case QuantifierGreedy: | |
107 | type = ByteTerm::TypePatternCharacterGreedy; | |
108 | break; | |
109 | case QuantifierNonGreedy: | |
110 | type = ByteTerm::TypePatternCharacterNonGreedy; | |
111 | break; | |
112 | } | |
113 | ||
114 | atom.patternCharacter = ch; | |
115 | atom.quantityType = quantityType; | |
116 | atom.quantityCount = quantityCount; | |
117 | inputPosition = inputPos; | |
118 | } | |
119 | ||
120 | ByteTerm(UChar lo, UChar hi, int inputPos, unsigned frameLocation, unsigned quantityCount, QuantifierType quantityType) | |
121 | : frameLocation(frameLocation) | |
122 | { | |
123 | switch (quantityType) { | |
124 | case QuantifierFixedCount: | |
125 | type = (quantityCount == 1) ? ByteTerm::TypePatternCasedCharacterOnce : ByteTerm::TypePatternCasedCharacterFixed; | |
126 | break; | |
127 | case QuantifierGreedy: | |
128 | type = ByteTerm::TypePatternCasedCharacterGreedy; | |
129 | break; | |
130 | case QuantifierNonGreedy: | |
131 | type = ByteTerm::TypePatternCasedCharacterNonGreedy; | |
132 | break; | |
133 | } | |
134 | ||
135 | atom.casedCharacter.lo = lo; | |
136 | atom.casedCharacter.hi = hi; | |
137 | atom.quantityType = quantityType; | |
138 | atom.quantityCount = quantityCount; | |
139 | inputPosition = inputPos; | |
140 | } | |
141 | ||
142 | ByteTerm(CharacterClass* characterClass, bool invert, int inputPos) | |
143 | : type(ByteTerm::TypeCharacterClass) | |
144 | , invertOrCapture(invert) | |
145 | { | |
146 | atom.characterClass = characterClass; | |
147 | atom.quantityType = QuantifierFixedCount; | |
148 | atom.quantityCount = 1; | |
149 | inputPosition = inputPos; | |
150 | } | |
151 | ||
152 | ByteTerm(Type type, unsigned subpatternId, ByteDisjunction* parenthesesInfo, bool invertOrCapture, int inputPos) | |
153 | : type(type) | |
154 | , invertOrCapture(invertOrCapture) | |
155 | { | |
156 | atom.subpatternId = subpatternId; | |
157 | atom.parenthesesDisjunction = parenthesesInfo; | |
158 | atom.quantityType = QuantifierFixedCount; | |
159 | atom.quantityCount = 1; | |
160 | inputPosition = inputPos; | |
161 | } | |
162 | ||
163 | ByteTerm(Type type, bool invert = false) | |
164 | : type(type) | |
165 | , invertOrCapture(invert) | |
166 | { | |
167 | atom.quantityType = QuantifierFixedCount; | |
168 | atom.quantityCount = 1; | |
169 | } | |
170 | ||
171 | ByteTerm(Type type, unsigned subpatternId, bool invertOrCapture, int inputPos) | |
172 | : type(type) | |
173 | , invertOrCapture(invertOrCapture) | |
174 | { | |
175 | atom.subpatternId = subpatternId; | |
176 | atom.quantityType = QuantifierFixedCount; | |
177 | atom.quantityCount = 1; | |
178 | inputPosition = inputPos; | |
179 | } | |
180 | ||
181 | static ByteTerm BOL(int inputPos) | |
182 | { | |
183 | ByteTerm term(TypeAssertionBOL); | |
184 | term.inputPosition = inputPos; | |
185 | return term; | |
186 | } | |
187 | ||
188 | static ByteTerm CheckInput(unsigned count) | |
189 | { | |
190 | ByteTerm term(TypeCheckInput); | |
191 | term.checkInputCount = count; | |
192 | return term; | |
193 | } | |
194 | ||
195 | static ByteTerm EOL(int inputPos) | |
196 | { | |
197 | ByteTerm term(TypeAssertionEOL); | |
198 | term.inputPosition = inputPos; | |
199 | return term; | |
200 | } | |
201 | ||
202 | static ByteTerm WordBoundary(bool invert, int inputPos) | |
203 | { | |
204 | ByteTerm term(TypeAssertionWordBoundary, invert); | |
205 | term.inputPosition = inputPos; | |
206 | return term; | |
207 | } | |
208 | ||
209 | static ByteTerm BackReference(unsigned subpatternId, int inputPos) | |
210 | { | |
211 | return ByteTerm(TypeBackReference, subpatternId, false, inputPos); | |
212 | } | |
213 | ||
214 | static ByteTerm BodyAlternativeBegin() | |
215 | { | |
216 | ByteTerm term(TypeBodyAlternativeBegin); | |
217 | term.alternative.next = 0; | |
218 | term.alternative.end = 0; | |
219 | return term; | |
220 | } | |
221 | ||
222 | static ByteTerm BodyAlternativeDisjunction() | |
223 | { | |
224 | ByteTerm term(TypeBodyAlternativeDisjunction); | |
225 | term.alternative.next = 0; | |
226 | term.alternative.end = 0; | |
227 | return term; | |
228 | } | |
229 | ||
230 | static ByteTerm BodyAlternativeEnd() | |
231 | { | |
232 | ByteTerm term(TypeBodyAlternativeEnd); | |
233 | term.alternative.next = 0; | |
234 | term.alternative.end = 0; | |
235 | return term; | |
236 | } | |
237 | ||
238 | static ByteTerm AlternativeBegin() | |
239 | { | |
240 | ByteTerm term(TypeAlternativeBegin); | |
241 | term.alternative.next = 0; | |
242 | term.alternative.end = 0; | |
243 | return term; | |
244 | } | |
245 | ||
246 | static ByteTerm AlternativeDisjunction() | |
247 | { | |
248 | ByteTerm term(TypeAlternativeDisjunction); | |
249 | term.alternative.next = 0; | |
250 | term.alternative.end = 0; | |
251 | return term; | |
252 | } | |
253 | ||
254 | static ByteTerm AlternativeEnd() | |
255 | { | |
256 | ByteTerm term(TypeAlternativeEnd); | |
257 | term.alternative.next = 0; | |
258 | term.alternative.end = 0; | |
259 | return term; | |
260 | } | |
261 | ||
262 | static ByteTerm SubpatternBegin() | |
263 | { | |
264 | return ByteTerm(TypeSubpatternBegin); | |
265 | } | |
266 | ||
267 | static ByteTerm SubpatternEnd() | |
268 | { | |
269 | return ByteTerm(TypeSubpatternEnd); | |
270 | } | |
271 | ||
272 | bool invert() | |
273 | { | |
274 | return invertOrCapture; | |
275 | } | |
276 | ||
277 | bool capture() | |
278 | { | |
279 | return invertOrCapture; | |
280 | } | |
281 | }; | |
282 | ||
f9bf01c6 | 283 | class ByteDisjunction : public FastAllocBase { |
ba379fdc A |
284 | public: |
285 | ByteDisjunction(unsigned numSubpatterns, unsigned frameSize) | |
286 | : m_numSubpatterns(numSubpatterns) | |
287 | , m_frameSize(frameSize) | |
288 | { | |
289 | } | |
290 | ||
291 | Vector<ByteTerm> terms; | |
292 | unsigned m_numSubpatterns; | |
293 | unsigned m_frameSize; | |
294 | }; | |
295 | ||
f9bf01c6 | 296 | struct BytecodePattern : FastAllocBase { |
ba379fdc A |
297 | BytecodePattern(ByteDisjunction* body, Vector<ByteDisjunction*> allParenthesesInfo, RegexPattern& pattern) |
298 | : m_body(body) | |
299 | , m_ignoreCase(pattern.m_ignoreCase) | |
300 | , m_multiline(pattern.m_multiline) | |
301 | { | |
302 | newlineCharacterClass = pattern.newlineCharacterClass(); | |
303 | wordcharCharacterClass = pattern.wordcharCharacterClass(); | |
304 | ||
305 | m_allParenthesesInfo.append(allParenthesesInfo); | |
306 | m_userCharacterClasses.append(pattern.m_userCharacterClasses); | |
307 | // 'Steal' the RegexPattern's CharacterClasses! We clear its | |
308 | // array, so that it won't delete them on destruction. We'll | |
309 | // take responsibility for that. | |
310 | pattern.m_userCharacterClasses.clear(); | |
311 | } | |
312 | ||
313 | ~BytecodePattern() | |
314 | { | |
315 | deleteAllValues(m_allParenthesesInfo); | |
316 | deleteAllValues(m_userCharacterClasses); | |
317 | } | |
318 | ||
319 | OwnPtr<ByteDisjunction> m_body; | |
320 | bool m_ignoreCase; | |
321 | bool m_multiline; | |
322 | ||
323 | CharacterClass* newlineCharacterClass; | |
324 | CharacterClass* wordcharCharacterClass; | |
325 | private: | |
326 | Vector<ByteDisjunction*> m_allParenthesesInfo; | |
327 | Vector<CharacterClass*> m_userCharacterClasses; | |
328 | }; | |
329 | ||
330 | BytecodePattern* byteCompileRegex(const UString& pattern, unsigned& numSubpatterns, const char*& error, bool ignoreCase = false, bool multiline = false); | |
331 | int interpretRegex(BytecodePattern* v_regex, const UChar* input, unsigned start, unsigned length, int* output); | |
332 | ||
333 | } } // namespace JSC::Yarr | |
334 | ||
335 | #endif | |
336 | ||
337 | #endif // RegexInterpreter_h |