]>
Commit | Line | Data |
---|---|---|
1 | /* | |
2 | * Copyright (C) 2009, 2010 Apple Inc. All rights reserved. | |
3 | * | |
4 | * Redistribution and use in source and binary forms, with or without | |
5 | * modification, are permitted provided that the following conditions | |
6 | * are met: | |
7 | * 1. Redistributions of source code must retain the above copyright | |
8 | * notice, this list of conditions and the following disclaimer. | |
9 | * 2. Redistributions in binary form must reproduce the above copyright | |
10 | * notice, this list of conditions and the following disclaimer in the | |
11 | * documentation and/or other materials provided with the distribution. | |
12 | * | |
13 | * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY | |
14 | * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |
15 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR | |
16 | * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR | |
17 | * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, | |
18 | * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, | |
19 | * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR | |
20 | * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY | |
21 | * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT | |
22 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE | |
23 | * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |
24 | */ | |
25 | ||
26 | #ifndef YarrInterpreter_h | |
27 | #define YarrInterpreter_h | |
28 | ||
29 | #include "YarrPattern.h" | |
30 | #include <wtf/PassOwnPtr.h> | |
31 | #include <wtf/unicode/Unicode.h> | |
32 | ||
33 | namespace WTF { | |
34 | class BumpPointerAllocator; | |
35 | } | |
36 | using WTF::BumpPointerAllocator; | |
37 | ||
38 | namespace JSC { namespace Yarr { | |
39 | ||
40 | class ByteDisjunction; | |
41 | ||
42 | struct ByteTerm { | |
43 | enum Type { | |
44 | TypeBodyAlternativeBegin, | |
45 | TypeBodyAlternativeDisjunction, | |
46 | TypeBodyAlternativeEnd, | |
47 | TypeAlternativeBegin, | |
48 | TypeAlternativeDisjunction, | |
49 | TypeAlternativeEnd, | |
50 | TypeSubpatternBegin, | |
51 | TypeSubpatternEnd, | |
52 | TypeAssertionBOL, | |
53 | TypeAssertionEOL, | |
54 | TypeAssertionWordBoundary, | |
55 | TypePatternCharacterOnce, | |
56 | TypePatternCharacterFixed, | |
57 | TypePatternCharacterGreedy, | |
58 | TypePatternCharacterNonGreedy, | |
59 | TypePatternCasedCharacterOnce, | |
60 | TypePatternCasedCharacterFixed, | |
61 | TypePatternCasedCharacterGreedy, | |
62 | TypePatternCasedCharacterNonGreedy, | |
63 | TypeCharacterClass, | |
64 | TypeBackReference, | |
65 | TypeParenthesesSubpattern, | |
66 | TypeParenthesesSubpatternOnceBegin, | |
67 | TypeParenthesesSubpatternOnceEnd, | |
68 | TypeParenthesesSubpatternTerminalBegin, | |
69 | TypeParenthesesSubpatternTerminalEnd, | |
70 | TypeParentheticalAssertionBegin, | |
71 | TypeParentheticalAssertionEnd, | |
72 | TypeCheckInput, | |
73 | TypeUncheckInput, | |
74 | TypeDotStarEnclosure, | |
75 | } type; | |
76 | union { | |
77 | struct { | |
78 | union { | |
79 | UChar patternCharacter; | |
80 | struct { | |
81 | UChar lo; | |
82 | UChar hi; | |
83 | } casedCharacter; | |
84 | CharacterClass* characterClass; | |
85 | unsigned subpatternId; | |
86 | }; | |
87 | union { | |
88 | ByteDisjunction* parenthesesDisjunction; | |
89 | unsigned parenthesesWidth; | |
90 | }; | |
91 | QuantifierType quantityType; | |
92 | unsigned quantityCount; | |
93 | } atom; | |
94 | struct { | |
95 | int next; | |
96 | int end; | |
97 | bool onceThrough; | |
98 | } alternative; | |
99 | struct { | |
100 | bool m_bol : 1; | |
101 | bool m_eol : 1; | |
102 | } anchors; | |
103 | unsigned checkInputCount; | |
104 | }; | |
105 | unsigned frameLocation; | |
106 | bool m_capture : 1; | |
107 | bool m_invert : 1; | |
108 | unsigned inputPosition; | |
109 | ||
110 | ByteTerm(UChar ch, int inputPos, unsigned frameLocation, Checked<unsigned> quantityCount, QuantifierType quantityType) | |
111 | : frameLocation(frameLocation) | |
112 | , m_capture(false) | |
113 | , m_invert(false) | |
114 | { | |
115 | switch (quantityType) { | |
116 | case QuantifierFixedCount: | |
117 | type = (quantityCount == 1) ? ByteTerm::TypePatternCharacterOnce : ByteTerm::TypePatternCharacterFixed; | |
118 | break; | |
119 | case QuantifierGreedy: | |
120 | type = ByteTerm::TypePatternCharacterGreedy; | |
121 | break; | |
122 | case QuantifierNonGreedy: | |
123 | type = ByteTerm::TypePatternCharacterNonGreedy; | |
124 | break; | |
125 | } | |
126 | ||
127 | atom.patternCharacter = ch; | |
128 | atom.quantityType = quantityType; | |
129 | atom.quantityCount = quantityCount.unsafeGet(); | |
130 | inputPosition = inputPos; | |
131 | } | |
132 | ||
133 | ByteTerm(UChar lo, UChar hi, int inputPos, unsigned frameLocation, Checked<unsigned> quantityCount, QuantifierType quantityType) | |
134 | : frameLocation(frameLocation) | |
135 | , m_capture(false) | |
136 | , m_invert(false) | |
137 | { | |
138 | switch (quantityType) { | |
139 | case QuantifierFixedCount: | |
140 | type = (quantityCount == 1) ? ByteTerm::TypePatternCasedCharacterOnce : ByteTerm::TypePatternCasedCharacterFixed; | |
141 | break; | |
142 | case QuantifierGreedy: | |
143 | type = ByteTerm::TypePatternCasedCharacterGreedy; | |
144 | break; | |
145 | case QuantifierNonGreedy: | |
146 | type = ByteTerm::TypePatternCasedCharacterNonGreedy; | |
147 | break; | |
148 | } | |
149 | ||
150 | atom.casedCharacter.lo = lo; | |
151 | atom.casedCharacter.hi = hi; | |
152 | atom.quantityType = quantityType; | |
153 | atom.quantityCount = quantityCount.unsafeGet(); | |
154 | inputPosition = inputPos; | |
155 | } | |
156 | ||
157 | ByteTerm(CharacterClass* characterClass, bool invert, int inputPos) | |
158 | : type(ByteTerm::TypeCharacterClass) | |
159 | , m_capture(false) | |
160 | , m_invert(invert) | |
161 | { | |
162 | atom.characterClass = characterClass; | |
163 | atom.quantityType = QuantifierFixedCount; | |
164 | atom.quantityCount = 1; | |
165 | inputPosition = inputPos; | |
166 | } | |
167 | ||
168 | ByteTerm(Type type, unsigned subpatternId, ByteDisjunction* parenthesesInfo, bool capture, int inputPos) | |
169 | : type(type) | |
170 | , m_capture(capture) | |
171 | , m_invert(false) | |
172 | { | |
173 | atom.subpatternId = subpatternId; | |
174 | atom.parenthesesDisjunction = parenthesesInfo; | |
175 | atom.quantityType = QuantifierFixedCount; | |
176 | atom.quantityCount = 1; | |
177 | inputPosition = inputPos; | |
178 | } | |
179 | ||
180 | ByteTerm(Type type, bool invert = false) | |
181 | : type(type) | |
182 | , m_capture(false) | |
183 | , m_invert(invert) | |
184 | { | |
185 | atom.quantityType = QuantifierFixedCount; | |
186 | atom.quantityCount = 1; | |
187 | } | |
188 | ||
189 | ByteTerm(Type type, unsigned subpatternId, bool capture, bool invert, int inputPos) | |
190 | : type(type) | |
191 | , m_capture(capture) | |
192 | , m_invert(invert) | |
193 | { | |
194 | atom.subpatternId = subpatternId; | |
195 | atom.quantityType = QuantifierFixedCount; | |
196 | atom.quantityCount = 1; | |
197 | inputPosition = inputPos; | |
198 | } | |
199 | ||
200 | static ByteTerm BOL(int inputPos) | |
201 | { | |
202 | ByteTerm term(TypeAssertionBOL); | |
203 | term.inputPosition = inputPos; | |
204 | return term; | |
205 | } | |
206 | ||
207 | static ByteTerm CheckInput(Checked<unsigned> count) | |
208 | { | |
209 | ByteTerm term(TypeCheckInput); | |
210 | term.checkInputCount = count.unsafeGet(); | |
211 | return term; | |
212 | } | |
213 | ||
214 | static ByteTerm UncheckInput(Checked<unsigned> count) | |
215 | { | |
216 | ByteTerm term(TypeUncheckInput); | |
217 | term.checkInputCount = count.unsafeGet(); | |
218 | return term; | |
219 | } | |
220 | ||
221 | static ByteTerm EOL(int inputPos) | |
222 | { | |
223 | ByteTerm term(TypeAssertionEOL); | |
224 | term.inputPosition = inputPos; | |
225 | return term; | |
226 | } | |
227 | ||
228 | static ByteTerm WordBoundary(bool invert, int inputPos) | |
229 | { | |
230 | ByteTerm term(TypeAssertionWordBoundary, invert); | |
231 | term.inputPosition = inputPos; | |
232 | return term; | |
233 | } | |
234 | ||
235 | static ByteTerm BackReference(unsigned subpatternId, int inputPos) | |
236 | { | |
237 | return ByteTerm(TypeBackReference, subpatternId, false, false, inputPos); | |
238 | } | |
239 | ||
240 | static ByteTerm BodyAlternativeBegin(bool onceThrough) | |
241 | { | |
242 | ByteTerm term(TypeBodyAlternativeBegin); | |
243 | term.alternative.next = 0; | |
244 | term.alternative.end = 0; | |
245 | term.alternative.onceThrough = onceThrough; | |
246 | return term; | |
247 | } | |
248 | ||
249 | static ByteTerm BodyAlternativeDisjunction(bool onceThrough) | |
250 | { | |
251 | ByteTerm term(TypeBodyAlternativeDisjunction); | |
252 | term.alternative.next = 0; | |
253 | term.alternative.end = 0; | |
254 | term.alternative.onceThrough = onceThrough; | |
255 | return term; | |
256 | } | |
257 | ||
258 | static ByteTerm BodyAlternativeEnd() | |
259 | { | |
260 | ByteTerm term(TypeBodyAlternativeEnd); | |
261 | term.alternative.next = 0; | |
262 | term.alternative.end = 0; | |
263 | term.alternative.onceThrough = false; | |
264 | return term; | |
265 | } | |
266 | ||
267 | static ByteTerm AlternativeBegin() | |
268 | { | |
269 | ByteTerm term(TypeAlternativeBegin); | |
270 | term.alternative.next = 0; | |
271 | term.alternative.end = 0; | |
272 | term.alternative.onceThrough = false; | |
273 | return term; | |
274 | } | |
275 | ||
276 | static ByteTerm AlternativeDisjunction() | |
277 | { | |
278 | ByteTerm term(TypeAlternativeDisjunction); | |
279 | term.alternative.next = 0; | |
280 | term.alternative.end = 0; | |
281 | term.alternative.onceThrough = false; | |
282 | return term; | |
283 | } | |
284 | ||
285 | static ByteTerm AlternativeEnd() | |
286 | { | |
287 | ByteTerm term(TypeAlternativeEnd); | |
288 | term.alternative.next = 0; | |
289 | term.alternative.end = 0; | |
290 | term.alternative.onceThrough = false; | |
291 | return term; | |
292 | } | |
293 | ||
294 | static ByteTerm SubpatternBegin() | |
295 | { | |
296 | return ByteTerm(TypeSubpatternBegin); | |
297 | } | |
298 | ||
299 | static ByteTerm SubpatternEnd() | |
300 | { | |
301 | return ByteTerm(TypeSubpatternEnd); | |
302 | } | |
303 | ||
304 | static ByteTerm DotStarEnclosure(bool bolAnchor, bool eolAnchor) | |
305 | { | |
306 | ByteTerm term(TypeDotStarEnclosure); | |
307 | term.anchors.m_bol = bolAnchor; | |
308 | term.anchors.m_eol = eolAnchor; | |
309 | return term; | |
310 | } | |
311 | ||
312 | bool invert() | |
313 | { | |
314 | return m_invert; | |
315 | } | |
316 | ||
317 | bool capture() | |
318 | { | |
319 | return m_capture; | |
320 | } | |
321 | }; | |
322 | ||
323 | class ByteDisjunction { | |
324 | WTF_MAKE_FAST_ALLOCATED; | |
325 | public: | |
326 | ByteDisjunction(unsigned numSubpatterns, unsigned frameSize) | |
327 | : m_numSubpatterns(numSubpatterns) | |
328 | , m_frameSize(frameSize) | |
329 | { | |
330 | } | |
331 | ||
332 | Vector<ByteTerm> terms; | |
333 | unsigned m_numSubpatterns; | |
334 | unsigned m_frameSize; | |
335 | }; | |
336 | ||
337 | struct BytecodePattern { | |
338 | WTF_MAKE_FAST_ALLOCATED; | |
339 | public: | |
340 | BytecodePattern(PassOwnPtr<ByteDisjunction> body, Vector<OwnPtr<ByteDisjunction> >& parenthesesInfoToAdopt, YarrPattern& pattern, BumpPointerAllocator* allocator) | |
341 | : m_body(body) | |
342 | , m_ignoreCase(pattern.m_ignoreCase) | |
343 | , m_multiline(pattern.m_multiline) | |
344 | , m_allocator(allocator) | |
345 | { | |
346 | m_body->terms.shrinkToFit(); | |
347 | ||
348 | newlineCharacterClass = pattern.newlineCharacterClass(); | |
349 | wordcharCharacterClass = pattern.wordcharCharacterClass(); | |
350 | ||
351 | m_allParenthesesInfo.swap(parenthesesInfoToAdopt); | |
352 | m_allParenthesesInfo.shrinkToFit(); | |
353 | ||
354 | m_userCharacterClasses.swap(pattern.m_userCharacterClasses); | |
355 | m_userCharacterClasses.shrinkToFit(); | |
356 | } | |
357 | ||
358 | OwnPtr<ByteDisjunction> m_body; | |
359 | bool m_ignoreCase; | |
360 | bool m_multiline; | |
361 | // Each BytecodePattern is associated with a RegExp, each RegExp is associated | |
362 | // with a VM. Cache a pointer to out VM's m_regExpAllocator. | |
363 | BumpPointerAllocator* m_allocator; | |
364 | ||
365 | CharacterClass* newlineCharacterClass; | |
366 | CharacterClass* wordcharCharacterClass; | |
367 | ||
368 | private: | |
369 | Vector<OwnPtr<ByteDisjunction> > m_allParenthesesInfo; | |
370 | Vector<OwnPtr<CharacterClass> > m_userCharacterClasses; | |
371 | }; | |
372 | ||
373 | JS_EXPORT_PRIVATE PassOwnPtr<BytecodePattern> byteCompile(YarrPattern&, BumpPointerAllocator*); | |
374 | JS_EXPORT_PRIVATE unsigned interpret(BytecodePattern*, const String& input, unsigned start, unsigned* output); | |
375 | unsigned interpret(BytecodePattern*, const LChar* input, unsigned length, unsigned start, unsigned* output); | |
376 | unsigned interpret(BytecodePattern*, const UChar* input, unsigned length, unsigned start, unsigned* output); | |
377 | ||
378 | } } // namespace JSC::Yarr | |
379 | ||
380 | #endif // YarrInterpreter_h |