]> git.saurik.com Git - apple/javascriptcore.git/blob - yarr/RegexPattern.h
3271cc162bf4ff078b8e5295ceb9cd751678157a
[apple/javascriptcore.git] / yarr / RegexPattern.h
1 /*
2 * Copyright (C) 2009 Apple Inc. All rights reserved.
3 *
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions
6 * are met:
7 * 1. Redistributions of source code must retain the above copyright
8 * notice, this list of conditions and the following disclaimer.
9 * 2. Redistributions in binary form must reproduce the above copyright
10 * notice, this list of conditions and the following disclaimer in the
11 * documentation and/or other materials provided with the distribution.
12 *
13 * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
14 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
15 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
16 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
17 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
18 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
19 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
20 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
21 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
22 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
23 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
24 */
25
26 #ifndef RegexPattern_h
27 #define RegexPattern_h
28
29
30 #if ENABLE(YARR)
31
32 #include <wtf/Vector.h>
33 #include <wtf/unicode/Unicode.h>
34
35
36 namespace JSC { namespace Yarr {
37
38 #define RegexStackSpaceForBackTrackInfoPatternCharacter 1 // Only for !fixed quantifiers.
39 #define RegexStackSpaceForBackTrackInfoCharacterClass 1 // Only for !fixed quantifiers.
40 #define RegexStackSpaceForBackTrackInfoBackReference 2
41 #define RegexStackSpaceForBackTrackInfoAlternative 1 // One per alternative.
42 #define RegexStackSpaceForBackTrackInfoParentheticalAssertion 1
43 #define RegexStackSpaceForBackTrackInfoParenthesesOnce 1 // Only for !fixed quantifiers.
44 #define RegexStackSpaceForBackTrackInfoParentheses 4
45
46 struct PatternDisjunction;
47
48 struct CharacterRange {
49 UChar begin;
50 UChar end;
51
52 CharacterRange(UChar begin, UChar end)
53 : begin(begin)
54 , end(end)
55 {
56 }
57 };
58
59 struct CharacterClassTable : RefCounted<CharacterClassTable> {
60 const char* m_table;
61 bool m_inverted;
62 static PassRefPtr<CharacterClassTable> create(const char* table, bool inverted)
63 {
64 return adoptRef(new CharacterClassTable(table, inverted));
65 }
66
67 private:
68 CharacterClassTable(const char* table, bool inverted)
69 : m_table(table)
70 , m_inverted(inverted)
71 {
72 }
73 };
74
75 struct CharacterClass : FastAllocBase {
76 // All CharacterClass instances have to have the full set of matches and ranges,
77 // they may have an optional table for faster lookups (which must match the
78 // specified matches and ranges)
79 CharacterClass(PassRefPtr<CharacterClassTable> table)
80 : m_table(table)
81 {
82 }
83 Vector<UChar> m_matches;
84 Vector<CharacterRange> m_ranges;
85 Vector<UChar> m_matchesUnicode;
86 Vector<CharacterRange> m_rangesUnicode;
87 RefPtr<CharacterClassTable> m_table;
88 };
89
90 enum QuantifierType {
91 QuantifierFixedCount,
92 QuantifierGreedy,
93 QuantifierNonGreedy,
94 };
95
96 struct PatternTerm {
97 enum Type {
98 TypeAssertionBOL,
99 TypeAssertionEOL,
100 TypeAssertionWordBoundary,
101 TypePatternCharacter,
102 TypeCharacterClass,
103 TypeBackReference,
104 TypeForwardReference,
105 TypeParenthesesSubpattern,
106 TypeParentheticalAssertion,
107 } type;
108 bool invertOrCapture;
109 union {
110 UChar patternCharacter;
111 CharacterClass* characterClass;
112 unsigned subpatternId;
113 struct {
114 PatternDisjunction* disjunction;
115 unsigned subpatternId;
116 unsigned lastSubpatternId;
117 bool isCopy;
118 } parentheses;
119 };
120 QuantifierType quantityType;
121 unsigned quantityCount;
122 int inputPosition;
123 unsigned frameLocation;
124
125 PatternTerm(UChar ch)
126 : type(PatternTerm::TypePatternCharacter)
127 {
128 patternCharacter = ch;
129 quantityType = QuantifierFixedCount;
130 quantityCount = 1;
131 }
132
133 PatternTerm(CharacterClass* charClass, bool invert)
134 : type(PatternTerm::TypeCharacterClass)
135 , invertOrCapture(invert)
136 {
137 characterClass = charClass;
138 quantityType = QuantifierFixedCount;
139 quantityCount = 1;
140 }
141
142 PatternTerm(Type type, unsigned subpatternId, PatternDisjunction* disjunction, bool invertOrCapture)
143 : type(type)
144 , invertOrCapture(invertOrCapture)
145 {
146 parentheses.disjunction = disjunction;
147 parentheses.subpatternId = subpatternId;
148 parentheses.isCopy = false;
149 quantityType = QuantifierFixedCount;
150 quantityCount = 1;
151 }
152
153 PatternTerm(Type type, bool invert = false)
154 : type(type)
155 , invertOrCapture(invert)
156 {
157 quantityType = QuantifierFixedCount;
158 quantityCount = 1;
159 }
160
161 PatternTerm(unsigned spatternId)
162 : type(TypeBackReference)
163 , invertOrCapture(false)
164 {
165 subpatternId = spatternId;
166 quantityType = QuantifierFixedCount;
167 quantityCount = 1;
168 }
169
170 static PatternTerm ForwardReference()
171 {
172 return PatternTerm(TypeForwardReference);
173 }
174
175 static PatternTerm BOL()
176 {
177 return PatternTerm(TypeAssertionBOL);
178 }
179
180 static PatternTerm EOL()
181 {
182 return PatternTerm(TypeAssertionEOL);
183 }
184
185 static PatternTerm WordBoundary(bool invert)
186 {
187 return PatternTerm(TypeAssertionWordBoundary, invert);
188 }
189
190 bool invert()
191 {
192 return invertOrCapture;
193 }
194
195 bool capture()
196 {
197 return invertOrCapture;
198 }
199
200 void quantify(unsigned count, QuantifierType type)
201 {
202 quantityCount = count;
203 quantityType = type;
204 }
205 };
206
207 struct PatternAlternative : FastAllocBase {
208 PatternAlternative(PatternDisjunction* disjunction)
209 : m_parent(disjunction)
210 {
211 }
212
213 PatternTerm& lastTerm()
214 {
215 ASSERT(m_terms.size());
216 return m_terms[m_terms.size() - 1];
217 }
218
219 void removeLastTerm()
220 {
221 ASSERT(m_terms.size());
222 m_terms.shrink(m_terms.size() - 1);
223 }
224
225 Vector<PatternTerm> m_terms;
226 PatternDisjunction* m_parent;
227 unsigned m_minimumSize;
228 bool m_hasFixedSize;
229 };
230
231 struct PatternDisjunction : FastAllocBase {
232 PatternDisjunction(PatternAlternative* parent = 0)
233 : m_parent(parent)
234 {
235 }
236
237 ~PatternDisjunction()
238 {
239 deleteAllValues(m_alternatives);
240 }
241
242 PatternAlternative* addNewAlternative()
243 {
244 PatternAlternative* alternative = new PatternAlternative(this);
245 m_alternatives.append(alternative);
246 return alternative;
247 }
248
249 Vector<PatternAlternative*> m_alternatives;
250 PatternAlternative* m_parent;
251 unsigned m_minimumSize;
252 unsigned m_callFrameSize;
253 bool m_hasFixedSize;
254 };
255
256 // You probably don't want to be calling these functions directly
257 // (please to be calling newlineCharacterClass() et al on your
258 // friendly neighborhood RegexPattern instance to get nicely
259 // cached copies).
260 CharacterClass* newlineCreate();
261 CharacterClass* digitsCreate();
262 CharacterClass* spacesCreate();
263 CharacterClass* wordcharCreate();
264 CharacterClass* nondigitsCreate();
265 CharacterClass* nonspacesCreate();
266 CharacterClass* nonwordcharCreate();
267
268 struct RegexPattern {
269 RegexPattern(bool ignoreCase, bool multiline)
270 : m_ignoreCase(ignoreCase)
271 , m_multiline(multiline)
272 , m_numSubpatterns(0)
273 , m_maxBackReference(0)
274 , m_shouldFallBack(false)
275 , newlineCached(0)
276 , digitsCached(0)
277 , spacesCached(0)
278 , wordcharCached(0)
279 , nondigitsCached(0)
280 , nonspacesCached(0)
281 , nonwordcharCached(0)
282 {
283 }
284
285 ~RegexPattern()
286 {
287 deleteAllValues(m_disjunctions);
288 deleteAllValues(m_userCharacterClasses);
289 }
290
291 void reset()
292 {
293 m_numSubpatterns = 0;
294 m_maxBackReference = 0;
295
296 m_shouldFallBack = false;
297
298 newlineCached = 0;
299 digitsCached = 0;
300 spacesCached = 0;
301 wordcharCached = 0;
302 nondigitsCached = 0;
303 nonspacesCached = 0;
304 nonwordcharCached = 0;
305
306 deleteAllValues(m_disjunctions);
307 m_disjunctions.clear();
308 deleteAllValues(m_userCharacterClasses);
309 m_userCharacterClasses.clear();
310 }
311
312 bool containsIllegalBackReference()
313 {
314 return m_maxBackReference > m_numSubpatterns;
315 }
316
317 CharacterClass* newlineCharacterClass()
318 {
319 if (!newlineCached)
320 m_userCharacterClasses.append(newlineCached = newlineCreate());
321 return newlineCached;
322 }
323 CharacterClass* digitsCharacterClass()
324 {
325 if (!digitsCached)
326 m_userCharacterClasses.append(digitsCached = digitsCreate());
327 return digitsCached;
328 }
329 CharacterClass* spacesCharacterClass()
330 {
331 if (!spacesCached)
332 m_userCharacterClasses.append(spacesCached = spacesCreate());
333 return spacesCached;
334 }
335 CharacterClass* wordcharCharacterClass()
336 {
337 if (!wordcharCached)
338 m_userCharacterClasses.append(wordcharCached = wordcharCreate());
339 return wordcharCached;
340 }
341 CharacterClass* nondigitsCharacterClass()
342 {
343 if (!nondigitsCached)
344 m_userCharacterClasses.append(nondigitsCached = nondigitsCreate());
345 return nondigitsCached;
346 }
347 CharacterClass* nonspacesCharacterClass()
348 {
349 if (!nonspacesCached)
350 m_userCharacterClasses.append(nonspacesCached = nonspacesCreate());
351 return nonspacesCached;
352 }
353 CharacterClass* nonwordcharCharacterClass()
354 {
355 if (!nonwordcharCached)
356 m_userCharacterClasses.append(nonwordcharCached = nonwordcharCreate());
357 return nonwordcharCached;
358 }
359
360 bool m_ignoreCase;
361 bool m_multiline;
362 unsigned m_numSubpatterns;
363 unsigned m_maxBackReference;
364 bool m_shouldFallBack;
365 PatternDisjunction* m_body;
366 Vector<PatternDisjunction*, 4> m_disjunctions;
367 Vector<CharacterClass*> m_userCharacterClasses;
368
369 private:
370 CharacterClass* newlineCached;
371 CharacterClass* digitsCached;
372 CharacterClass* spacesCached;
373 CharacterClass* wordcharCached;
374 CharacterClass* nondigitsCached;
375 CharacterClass* nonspacesCached;
376 CharacterClass* nonwordcharCached;
377 };
378
379 } } // namespace JSC::Yarr
380
381 #endif
382
383 #endif // RegexPattern_h