]> git.saurik.com Git - apple/javascriptcore.git/blob - yarr/YarrPattern.h
e46479ef59cb14b3aa8d60778536c3df014e7cce
[apple/javascriptcore.git] / yarr / YarrPattern.h
1 /*
2 * Copyright (C) 2009 Apple Inc. All rights reserved.
3 * Copyright (C) 2010 Peter Varga (pvarga@inf.u-szeged.hu), University of Szeged
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 *
14 * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
15 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
17 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
18 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
19 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
20 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
21 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
22 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
24 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25 */
26
27 #ifndef YarrPattern_h
28 #define YarrPattern_h
29
30 #include <runtime/UString.h>
31 #include <wtf/CheckedArithmetic.h>
32 #include <wtf/Vector.h>
33 #include <wtf/unicode/Unicode.h>
34
35 namespace JSC { namespace Yarr {
36
37 struct PatternDisjunction;
38
39 struct CharacterRange {
40 UChar begin;
41 UChar end;
42
43 CharacterRange(UChar begin, UChar end)
44 : begin(begin)
45 , end(end)
46 {
47 }
48 };
49
50 struct CharacterClassTable : RefCounted<CharacterClassTable> {
51 const char* m_table;
52 bool m_inverted;
53 static PassRefPtr<CharacterClassTable> create(const char* table, bool inverted)
54 {
55 return adoptRef(new CharacterClassTable(table, inverted));
56 }
57
58 private:
59 CharacterClassTable(const char* table, bool inverted)
60 : m_table(table)
61 , m_inverted(inverted)
62 {
63 }
64 };
65
66 struct CharacterClass {
67 WTF_MAKE_FAST_ALLOCATED;
68 public:
69 // All CharacterClass instances have to have the full set of matches and ranges,
70 // they may have an optional table for faster lookups (which must match the
71 // specified matches and ranges)
72 CharacterClass(PassRefPtr<CharacterClassTable> table)
73 : m_table(table)
74 {
75 }
76 Vector<UChar> m_matches;
77 Vector<CharacterRange> m_ranges;
78 Vector<UChar> m_matchesUnicode;
79 Vector<CharacterRange> m_rangesUnicode;
80 RefPtr<CharacterClassTable> m_table;
81 };
82
83 enum QuantifierType {
84 QuantifierFixedCount,
85 QuantifierGreedy,
86 QuantifierNonGreedy,
87 };
88
89 struct PatternTerm {
90 enum Type {
91 TypeAssertionBOL,
92 TypeAssertionEOL,
93 TypeAssertionWordBoundary,
94 TypePatternCharacter,
95 TypeCharacterClass,
96 TypeBackReference,
97 TypeForwardReference,
98 TypeParenthesesSubpattern,
99 TypeParentheticalAssertion,
100 TypeDotStarEnclosure,
101 } type;
102 bool m_capture :1;
103 bool m_invert :1;
104 union {
105 UChar patternCharacter;
106 CharacterClass* characterClass;
107 unsigned backReferenceSubpatternId;
108 struct {
109 PatternDisjunction* disjunction;
110 unsigned subpatternId;
111 unsigned lastSubpatternId;
112 bool isCopy;
113 bool isTerminal;
114 } parentheses;
115 struct {
116 bool bolAnchor : 1;
117 bool eolAnchor : 1;
118 } anchors;
119 };
120 QuantifierType quantityType;
121 Checked<unsigned> quantityCount;
122 int inputPosition;
123 unsigned frameLocation;
124
125 PatternTerm(UChar ch)
126 : type(PatternTerm::TypePatternCharacter)
127 , m_capture(false)
128 , m_invert(false)
129 {
130 patternCharacter = ch;
131 quantityType = QuantifierFixedCount;
132 quantityCount = 1;
133 }
134
135 PatternTerm(CharacterClass* charClass, bool invert)
136 : type(PatternTerm::TypeCharacterClass)
137 , m_capture(false)
138 , m_invert(invert)
139 {
140 characterClass = charClass;
141 quantityType = QuantifierFixedCount;
142 quantityCount = 1;
143 }
144
145 PatternTerm(Type type, unsigned subpatternId, PatternDisjunction* disjunction, bool capture = false, bool invert = false)
146 : type(type)
147 , m_capture(capture)
148 , m_invert(invert)
149 {
150 parentheses.disjunction = disjunction;
151 parentheses.subpatternId = subpatternId;
152 parentheses.isCopy = false;
153 parentheses.isTerminal = false;
154 quantityType = QuantifierFixedCount;
155 quantityCount = 1;
156 }
157
158 PatternTerm(Type type, bool invert = false)
159 : type(type)
160 , m_capture(false)
161 , m_invert(invert)
162 {
163 quantityType = QuantifierFixedCount;
164 quantityCount = 1;
165 }
166
167 PatternTerm(unsigned spatternId)
168 : type(TypeBackReference)
169 , m_capture(false)
170 , m_invert(false)
171 {
172 backReferenceSubpatternId = spatternId;
173 quantityType = QuantifierFixedCount;
174 quantityCount = 1;
175 }
176
177 PatternTerm(bool bolAnchor, bool eolAnchor)
178 : type(TypeDotStarEnclosure)
179 , m_capture(false)
180 , m_invert(false)
181 {
182 anchors.bolAnchor = bolAnchor;
183 anchors.eolAnchor = eolAnchor;
184 quantityType = QuantifierFixedCount;
185 quantityCount = 1;
186 }
187
188 static PatternTerm ForwardReference()
189 {
190 return PatternTerm(TypeForwardReference);
191 }
192
193 static PatternTerm BOL()
194 {
195 return PatternTerm(TypeAssertionBOL);
196 }
197
198 static PatternTerm EOL()
199 {
200 return PatternTerm(TypeAssertionEOL);
201 }
202
203 static PatternTerm WordBoundary(bool invert)
204 {
205 return PatternTerm(TypeAssertionWordBoundary, invert);
206 }
207
208 bool invert()
209 {
210 return m_invert;
211 }
212
213 bool capture()
214 {
215 return m_capture;
216 }
217
218 void quantify(unsigned count, QuantifierType type)
219 {
220 quantityCount = count;
221 quantityType = type;
222 }
223 };
224
225 struct PatternAlternative {
226 WTF_MAKE_FAST_ALLOCATED;
227 public:
228 PatternAlternative(PatternDisjunction* disjunction)
229 : m_parent(disjunction)
230 , m_onceThrough(false)
231 , m_hasFixedSize(false)
232 , m_startsWithBOL(false)
233 , m_containsBOL(false)
234 {
235 }
236
237 PatternTerm& lastTerm()
238 {
239 ASSERT(m_terms.size());
240 return m_terms[m_terms.size() - 1];
241 }
242
243 void removeLastTerm()
244 {
245 ASSERT(m_terms.size());
246 m_terms.shrink(m_terms.size() - 1);
247 }
248
249 void setOnceThrough()
250 {
251 m_onceThrough = true;
252 }
253
254 bool onceThrough()
255 {
256 return m_onceThrough;
257 }
258
259 Vector<PatternTerm> m_terms;
260 PatternDisjunction* m_parent;
261 unsigned m_minimumSize;
262 bool m_onceThrough : 1;
263 bool m_hasFixedSize : 1;
264 bool m_startsWithBOL : 1;
265 bool m_containsBOL : 1;
266 };
267
268 struct PatternDisjunction {
269 WTF_MAKE_FAST_ALLOCATED;
270 public:
271 PatternDisjunction(PatternAlternative* parent = 0)
272 : m_parent(parent)
273 , m_hasFixedSize(false)
274 {
275 }
276
277 ~PatternDisjunction()
278 {
279 deleteAllValues(m_alternatives);
280 }
281
282 PatternAlternative* addNewAlternative()
283 {
284 PatternAlternative* alternative = new PatternAlternative(this);
285 m_alternatives.append(alternative);
286 return alternative;
287 }
288
289 Vector<PatternAlternative*> m_alternatives;
290 PatternAlternative* m_parent;
291 unsigned m_minimumSize;
292 unsigned m_callFrameSize;
293 bool m_hasFixedSize;
294 };
295
296 // You probably don't want to be calling these functions directly
297 // (please to be calling newlineCharacterClass() et al on your
298 // friendly neighborhood YarrPattern instance to get nicely
299 // cached copies).
300 CharacterClass* newlineCreate();
301 CharacterClass* digitsCreate();
302 CharacterClass* spacesCreate();
303 CharacterClass* wordcharCreate();
304 CharacterClass* nondigitsCreate();
305 CharacterClass* nonspacesCreate();
306 CharacterClass* nonwordcharCreate();
307
308 struct TermChain {
309 TermChain(PatternTerm term)
310 : term(term)
311 {}
312
313 PatternTerm term;
314 Vector<TermChain> hotTerms;
315 };
316
317 struct YarrPattern {
318 YarrPattern(const UString& pattern, bool ignoreCase, bool multiline, const char** error);
319
320 ~YarrPattern()
321 {
322 deleteAllValues(m_disjunctions);
323 deleteAllValues(m_userCharacterClasses);
324 }
325
326 void reset()
327 {
328 m_numSubpatterns = 0;
329 m_maxBackReference = 0;
330
331 m_containsBackreferences = false;
332 m_containsBOL = false;
333
334 newlineCached = 0;
335 digitsCached = 0;
336 spacesCached = 0;
337 wordcharCached = 0;
338 nondigitsCached = 0;
339 nonspacesCached = 0;
340 nonwordcharCached = 0;
341
342 deleteAllValues(m_disjunctions);
343 m_disjunctions.clear();
344 deleteAllValues(m_userCharacterClasses);
345 m_userCharacterClasses.clear();
346 }
347
348 bool containsIllegalBackReference()
349 {
350 return m_maxBackReference > m_numSubpatterns;
351 }
352
353 CharacterClass* newlineCharacterClass()
354 {
355 if (!newlineCached)
356 m_userCharacterClasses.append(newlineCached = newlineCreate());
357 return newlineCached;
358 }
359 CharacterClass* digitsCharacterClass()
360 {
361 if (!digitsCached)
362 m_userCharacterClasses.append(digitsCached = digitsCreate());
363 return digitsCached;
364 }
365 CharacterClass* spacesCharacterClass()
366 {
367 if (!spacesCached)
368 m_userCharacterClasses.append(spacesCached = spacesCreate());
369 return spacesCached;
370 }
371 CharacterClass* wordcharCharacterClass()
372 {
373 if (!wordcharCached)
374 m_userCharacterClasses.append(wordcharCached = wordcharCreate());
375 return wordcharCached;
376 }
377 CharacterClass* nondigitsCharacterClass()
378 {
379 if (!nondigitsCached)
380 m_userCharacterClasses.append(nondigitsCached = nondigitsCreate());
381 return nondigitsCached;
382 }
383 CharacterClass* nonspacesCharacterClass()
384 {
385 if (!nonspacesCached)
386 m_userCharacterClasses.append(nonspacesCached = nonspacesCreate());
387 return nonspacesCached;
388 }
389 CharacterClass* nonwordcharCharacterClass()
390 {
391 if (!nonwordcharCached)
392 m_userCharacterClasses.append(nonwordcharCached = nonwordcharCreate());
393 return nonwordcharCached;
394 }
395
396 bool m_ignoreCase : 1;
397 bool m_multiline : 1;
398 bool m_containsBackreferences : 1;
399 bool m_containsBOL : 1;
400 unsigned m_numSubpatterns;
401 unsigned m_maxBackReference;
402 PatternDisjunction* m_body;
403 Vector<PatternDisjunction*, 4> m_disjunctions;
404 Vector<CharacterClass*> m_userCharacterClasses;
405
406 private:
407 const char* compile(const UString& patternString);
408
409 CharacterClass* newlineCached;
410 CharacterClass* digitsCached;
411 CharacterClass* spacesCached;
412 CharacterClass* wordcharCached;
413 CharacterClass* nondigitsCached;
414 CharacterClass* nonspacesCached;
415 CharacterClass* nonwordcharCached;
416 };
417
418 } } // namespace JSC::Yarr
419
420 #endif // YarrPattern_h