]> git.saurik.com Git - apple/javascriptcore.git/blob - runtime/RegExp.cpp
JavaScriptCore-1218.tar.gz
[apple/javascriptcore.git] / runtime / RegExp.cpp
1 /*
2 * Copyright (C) 1999-2001, 2004 Harri Porten (porten@kde.org)
3 * Copyright (c) 2007, 2008 Apple Inc. All rights reserved.
4 * Copyright (C) 2009 Torch Mobile, Inc.
5 * Copyright (C) 2010 Peter Varga (pvarga@inf.u-szeged.hu), University of Szeged
6 *
7 * This library is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU Lesser General Public
9 * License as published by the Free Software Foundation; either
10 * version 2 of the License, or (at your option) any later version.
11 *
12 * This library is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 * Lesser General Public License for more details.
16 *
17 * You should have received a copy of the GNU Lesser General Public
18 * License along with this library; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20 *
21 */
22
23 #include "config.h"
24 #include "RegExp.h"
25
26 #include "Lexer.h"
27 #include "Operations.h"
28 #include "RegExpCache.h"
29 #include "Yarr.h"
30 #include "YarrJIT.h"
31 #include <stdio.h>
32 #include <stdlib.h>
33 #include <string.h>
34 #include <wtf/Assertions.h>
35 #include <wtf/OwnArrayPtr.h>
36
37 #define REGEXP_FUNC_TEST_DATA_GEN 0
38
39 namespace JSC {
40
41 const ClassInfo RegExp::s_info = { "RegExp", 0, 0, 0, CREATE_METHOD_TABLE(RegExp) };
42
43 RegExpFlags regExpFlags(const String& string)
44 {
45 RegExpFlags flags = NoFlags;
46
47 for (unsigned i = 0; i < string.length(); ++i) {
48 switch (string[i]) {
49 case 'g':
50 if (flags & FlagGlobal)
51 return InvalidFlags;
52 flags = static_cast<RegExpFlags>(flags | FlagGlobal);
53 break;
54
55 case 'i':
56 if (flags & FlagIgnoreCase)
57 return InvalidFlags;
58 flags = static_cast<RegExpFlags>(flags | FlagIgnoreCase);
59 break;
60
61 case 'm':
62 if (flags & FlagMultiline)
63 return InvalidFlags;
64 flags = static_cast<RegExpFlags>(flags | FlagMultiline);
65 break;
66
67 default:
68 return InvalidFlags;
69 }
70 }
71
72 return flags;
73 }
74
75 #if REGEXP_FUNC_TEST_DATA_GEN
76 class RegExpFunctionalTestCollector {
77 // This class is not thread safe.
78 protected:
79 static const char* const s_fileName;
80
81 public:
82 static RegExpFunctionalTestCollector* get();
83
84 ~RegExpFunctionalTestCollector();
85
86 void outputOneTest(RegExp*, String, int, int*, int);
87 void clearRegExp(RegExp* regExp)
88 {
89 if (regExp == m_lastRegExp)
90 m_lastRegExp = 0;
91 }
92
93 private:
94 RegExpFunctionalTestCollector();
95
96 void outputEscapedString(const String&, bool escapeSlash = false);
97
98 static RegExpFunctionalTestCollector* s_instance;
99 FILE* m_file;
100 RegExp* m_lastRegExp;
101 };
102
103 const char* const RegExpFunctionalTestCollector::s_fileName = "/tmp/RegExpTestsData";
104 RegExpFunctionalTestCollector* RegExpFunctionalTestCollector::s_instance = 0;
105
106 RegExpFunctionalTestCollector* RegExpFunctionalTestCollector::get()
107 {
108 if (!s_instance)
109 s_instance = new RegExpFunctionalTestCollector();
110
111 return s_instance;
112 }
113
114 void RegExpFunctionalTestCollector::outputOneTest(RegExp* regExp, String s, int startOffset, int* ovector, int result)
115 {
116 if ((!m_lastRegExp) || (m_lastRegExp != regExp)) {
117 m_lastRegExp = regExp;
118 fputc('/', m_file);
119 outputEscapedString(regExp->pattern(), true);
120 fputc('/', m_file);
121 if (regExp->global())
122 fputc('g', m_file);
123 if (regExp->ignoreCase())
124 fputc('i', m_file);
125 if (regExp->multiline())
126 fputc('m', m_file);
127 fprintf(m_file, "\n");
128 }
129
130 fprintf(m_file, " \"");
131 outputEscapedString(s);
132 fprintf(m_file, "\", %d, %d, (", startOffset, result);
133 for (unsigned i = 0; i <= regExp->numSubpatterns(); i++) {
134 int subpatternBegin = ovector[i * 2];
135 int subpatternEnd = ovector[i * 2 + 1];
136 if (subpatternBegin == -1)
137 subpatternEnd = -1;
138 fprintf(m_file, "%d, %d", subpatternBegin, subpatternEnd);
139 if (i < regExp->numSubpatterns())
140 fputs(", ", m_file);
141 }
142
143 fprintf(m_file, ")\n");
144 fflush(m_file);
145 }
146
147 RegExpFunctionalTestCollector::RegExpFunctionalTestCollector()
148 {
149 m_file = fopen(s_fileName, "r+");
150 if (!m_file)
151 m_file = fopen(s_fileName, "w+");
152
153 fseek(m_file, 0L, SEEK_END);
154 }
155
156 RegExpFunctionalTestCollector::~RegExpFunctionalTestCollector()
157 {
158 fclose(m_file);
159 s_instance = 0;
160 }
161
162 void RegExpFunctionalTestCollector::outputEscapedString(const String& s, bool escapeSlash)
163 {
164 int len = s.length();
165
166 for (int i = 0; i < len; ++i) {
167 UChar c = s[i];
168
169 switch (c) {
170 case '\0':
171 fputs("\\0", m_file);
172 break;
173 case '\a':
174 fputs("\\a", m_file);
175 break;
176 case '\b':
177 fputs("\\b", m_file);
178 break;
179 case '\f':
180 fputs("\\f", m_file);
181 break;
182 case '\n':
183 fputs("\\n", m_file);
184 break;
185 case '\r':
186 fputs("\\r", m_file);
187 break;
188 case '\t':
189 fputs("\\t", m_file);
190 break;
191 case '\v':
192 fputs("\\v", m_file);
193 break;
194 case '/':
195 if (escapeSlash)
196 fputs("\\/", m_file);
197 else
198 fputs("/", m_file);
199 break;
200 case '\"':
201 fputs("\\\"", m_file);
202 break;
203 case '\\':
204 fputs("\\\\", m_file);
205 break;
206 case '\?':
207 fputs("\?", m_file);
208 break;
209 default:
210 if (c > 0x7f)
211 fprintf(m_file, "\\u%04x", c);
212 else
213 fputc(c, m_file);
214 break;
215 }
216 }
217 }
218 #endif
219
220 RegExp::RegExp(VM& vm, const String& patternString, RegExpFlags flags)
221 : JSCell(vm, vm.regExpStructure.get())
222 , m_state(NotCompiled)
223 , m_patternString(patternString)
224 , m_flags(flags)
225 , m_constructionError(0)
226 , m_numSubpatterns(0)
227 #if ENABLE(REGEXP_TRACING)
228 , m_rtMatchCallCount(0)
229 , m_rtMatchFoundCount(0)
230 #endif
231 {
232 }
233
234 void RegExp::finishCreation(VM& vm)
235 {
236 Base::finishCreation(vm);
237 Yarr::YarrPattern pattern(m_patternString, ignoreCase(), multiline(), &m_constructionError);
238 if (m_constructionError)
239 m_state = ParseError;
240 else
241 m_numSubpatterns = pattern.m_numSubpatterns;
242 }
243
244 void RegExp::destroy(JSCell* cell)
245 {
246 RegExp* thisObject = static_cast<RegExp*>(cell);
247 #if REGEXP_FUNC_TEST_DATA_GEN
248 RegExpFunctionalTestCollector::get()->clearRegExp(this);
249 #endif
250 thisObject->RegExp::~RegExp();
251 }
252
253 RegExp* RegExp::createWithoutCaching(VM& vm, const String& patternString, RegExpFlags flags)
254 {
255 RegExp* regExp = new (NotNull, allocateCell<RegExp>(vm.heap)) RegExp(vm, patternString, flags);
256 regExp->finishCreation(vm);
257 return regExp;
258 }
259
260 RegExp* RegExp::create(VM& vm, const String& patternString, RegExpFlags flags)
261 {
262 return vm.regExpCache()->lookupOrCreate(patternString, flags);
263 }
264
265 void RegExp::compile(VM* vm, Yarr::YarrCharSize charSize)
266 {
267 Yarr::YarrPattern pattern(m_patternString, ignoreCase(), multiline(), &m_constructionError);
268 if (m_constructionError) {
269 RELEASE_ASSERT_NOT_REACHED();
270 m_state = ParseError;
271 return;
272 }
273 ASSERT(m_numSubpatterns == pattern.m_numSubpatterns);
274
275 if (!hasCode()) {
276 ASSERT(m_state == NotCompiled);
277 vm->regExpCache()->addToStrongCache(this);
278 m_state = ByteCode;
279 }
280
281 #if ENABLE(YARR_JIT)
282 if (!pattern.m_containsBackreferences && vm->canUseRegExpJIT()) {
283 Yarr::jitCompile(pattern, charSize, vm, m_regExpJITCode);
284 #if ENABLE(YARR_JIT_DEBUG)
285 if (!m_regExpJITCode.isFallBack())
286 m_state = JITCode;
287 else
288 m_state = ByteCode;
289 #else
290 if (!m_regExpJITCode.isFallBack()) {
291 m_state = JITCode;
292 return;
293 }
294 #endif
295 }
296 #else
297 UNUSED_PARAM(charSize);
298 #endif
299
300 m_regExpBytecode = Yarr::byteCompile(pattern, &vm->m_regExpAllocator);
301 }
302
303 void RegExp::compileIfNecessary(VM& vm, Yarr::YarrCharSize charSize)
304 {
305 if (hasCode()) {
306 #if ENABLE(YARR_JIT)
307 if (m_state != JITCode)
308 return;
309 if ((charSize == Yarr::Char8) && (m_regExpJITCode.has8BitCode()))
310 return;
311 if ((charSize == Yarr::Char16) && (m_regExpJITCode.has16BitCode()))
312 return;
313 #else
314 return;
315 #endif
316 }
317
318 compile(&vm, charSize);
319 }
320
321 int RegExp::match(VM& vm, const String& s, unsigned startOffset, Vector<int, 32>& ovector)
322 {
323 #if ENABLE(REGEXP_TRACING)
324 m_rtMatchCallCount++;
325 #endif
326
327 ASSERT(m_state != ParseError);
328 compileIfNecessary(vm, s.is8Bit() ? Yarr::Char8 : Yarr::Char16);
329
330 int offsetVectorSize = (m_numSubpatterns + 1) * 2;
331 ovector.resize(offsetVectorSize);
332 int* offsetVector = ovector.data();
333
334 int result;
335 #if ENABLE(YARR_JIT)
336 if (m_state == JITCode) {
337 if (s.is8Bit())
338 result = m_regExpJITCode.execute(s.characters8(), startOffset, s.length(), offsetVector).start;
339 else
340 result = m_regExpJITCode.execute(s.characters16(), startOffset, s.length(), offsetVector).start;
341 #if ENABLE(YARR_JIT_DEBUG)
342 matchCompareWithInterpreter(s, startOffset, offsetVector, result);
343 #endif
344 } else
345 #endif
346 result = Yarr::interpret(m_regExpBytecode.get(), s, startOffset, reinterpret_cast<unsigned*>(offsetVector));
347
348 // FIXME: The YARR engine should handle unsigned or size_t length matches.
349 // The YARR Interpreter is "unsigned" clean, while the YARR JIT hasn't been addressed.
350 // The offset vector handling needs to change as well.
351 // Right now we convert a match where the offsets overflowed into match failure.
352 // There are two places in WebCore that call the interpreter directly that need to
353 // have their offsets changed to int as well. They are platform/text/RegularExpression.cpp
354 // and inspector/ContentSearchUtils.cpp.
355 if (s.length() > INT_MAX) {
356 bool overflowed = false;
357
358 if (result < -1)
359 overflowed = true;
360
361 for (unsigned i = 0; i <= m_numSubpatterns; i++) {
362 if ((offsetVector[i*2] < -1) || ((offsetVector[i*2] >= 0) && (offsetVector[i*2+1] < -1))) {
363 overflowed = true;
364 offsetVector[i*2] = -1;
365 offsetVector[i*2+1] = -1;
366 }
367 }
368
369 if (overflowed)
370 result = -1;
371 }
372
373 ASSERT(result >= -1);
374
375 #if REGEXP_FUNC_TEST_DATA_GEN
376 RegExpFunctionalTestCollector::get()->outputOneTest(this, s, startOffset, offsetVector, result);
377 #endif
378
379 #if ENABLE(REGEXP_TRACING)
380 if (result != -1)
381 m_rtMatchFoundCount++;
382 #endif
383
384 return result;
385 }
386
387 void RegExp::compileMatchOnly(VM* vm, Yarr::YarrCharSize charSize)
388 {
389 Yarr::YarrPattern pattern(m_patternString, ignoreCase(), multiline(), &m_constructionError);
390 if (m_constructionError) {
391 RELEASE_ASSERT_NOT_REACHED();
392 m_state = ParseError;
393 return;
394 }
395 ASSERT(m_numSubpatterns == pattern.m_numSubpatterns);
396
397 if (!hasCode()) {
398 ASSERT(m_state == NotCompiled);
399 vm->regExpCache()->addToStrongCache(this);
400 m_state = ByteCode;
401 }
402
403 #if ENABLE(YARR_JIT)
404 if (!pattern.m_containsBackreferences && vm->canUseRegExpJIT()) {
405 Yarr::jitCompile(pattern, charSize, vm, m_regExpJITCode, Yarr::MatchOnly);
406 #if ENABLE(YARR_JIT_DEBUG)
407 if (!m_regExpJITCode.isFallBack())
408 m_state = JITCode;
409 else
410 m_state = ByteCode;
411 #else
412 if (!m_regExpJITCode.isFallBack()) {
413 m_state = JITCode;
414 return;
415 }
416 #endif
417 }
418 #else
419 UNUSED_PARAM(charSize);
420 #endif
421
422 m_regExpBytecode = Yarr::byteCompile(pattern, &vm->m_regExpAllocator);
423 }
424
425 void RegExp::compileIfNecessaryMatchOnly(VM& vm, Yarr::YarrCharSize charSize)
426 {
427 if (hasCode()) {
428 #if ENABLE(YARR_JIT)
429 if (m_state != JITCode)
430 return;
431 if ((charSize == Yarr::Char8) && (m_regExpJITCode.has8BitCodeMatchOnly()))
432 return;
433 if ((charSize == Yarr::Char16) && (m_regExpJITCode.has16BitCodeMatchOnly()))
434 return;
435 #else
436 return;
437 #endif
438 }
439
440 compileMatchOnly(&vm, charSize);
441 }
442
443 MatchResult RegExp::match(VM& vm, const String& s, unsigned startOffset)
444 {
445 #if ENABLE(REGEXP_TRACING)
446 m_rtMatchCallCount++;
447 #endif
448
449 ASSERT(m_state != ParseError);
450 compileIfNecessaryMatchOnly(vm, s.is8Bit() ? Yarr::Char8 : Yarr::Char16);
451
452 #if ENABLE(YARR_JIT)
453 if (m_state == JITCode) {
454 MatchResult result = s.is8Bit() ?
455 m_regExpJITCode.execute(s.characters8(), startOffset, s.length()) :
456 m_regExpJITCode.execute(s.characters16(), startOffset, s.length());
457 #if ENABLE(REGEXP_TRACING)
458 if (!result)
459 m_rtMatchFoundCount++;
460 #endif
461 return result;
462 }
463 #endif
464
465 int offsetVectorSize = (m_numSubpatterns + 1) * 2;
466 int* offsetVector;
467 Vector<int, 32> nonReturnedOvector;
468 nonReturnedOvector.resize(offsetVectorSize);
469 offsetVector = nonReturnedOvector.data();
470 int r = Yarr::interpret(m_regExpBytecode.get(), s, startOffset, reinterpret_cast<unsigned*>(offsetVector));
471 #if REGEXP_FUNC_TEST_DATA_GEN
472 RegExpFunctionalTestCollector::get()->outputOneTest(this, s, startOffset, offsetVector, result);
473 #endif
474
475 if (r >= 0) {
476 #if ENABLE(REGEXP_TRACING)
477 m_rtMatchFoundCount++;
478 #endif
479 return MatchResult(r, reinterpret_cast<unsigned*>(offsetVector)[1]);
480 }
481
482 return MatchResult::failed();
483 }
484
485 void RegExp::invalidateCode()
486 {
487 if (!hasCode())
488 return;
489 m_state = NotCompiled;
490 #if ENABLE(YARR_JIT)
491 m_regExpJITCode.clear();
492 #endif
493 m_regExpBytecode.clear();
494 }
495
496 #if ENABLE(YARR_JIT_DEBUG)
497 void RegExp::matchCompareWithInterpreter(const String& s, int startOffset, int* offsetVector, int jitResult)
498 {
499 int offsetVectorSize = (m_numSubpatterns + 1) * 2;
500 Vector<int, 32> interpreterOvector;
501 interpreterOvector.resize(offsetVectorSize);
502 int* interpreterOffsetVector = interpreterOvector.data();
503 int interpreterResult = 0;
504 int differences = 0;
505
506 // Initialize interpreterOffsetVector with the return value (index 0) and the
507 // first subpattern start indicies (even index values) set to -1.
508 // No need to init the subpattern end indicies.
509 for (unsigned j = 0, i = 0; i < m_numSubpatterns + 1; j += 2, i++)
510 interpreterOffsetVector[j] = -1;
511
512 interpreterResult = Yarr::interpret(m_regExpBytecode.get(), s, startOffset, interpreterOffsetVector);
513
514 if (jitResult != interpreterResult)
515 differences++;
516
517 for (unsigned j = 2, i = 0; i < m_numSubpatterns; j +=2, i++)
518 if ((offsetVector[j] != interpreterOffsetVector[j])
519 || ((offsetVector[j] >= 0) && (offsetVector[j+1] != interpreterOffsetVector[j+1])))
520 differences++;
521
522 if (differences) {
523 dataLogF("RegExp Discrepency for /%s/\n string input ", pattern().utf8().data());
524 unsigned segmentLen = s.length() - static_cast<unsigned>(startOffset);
525
526 dataLogF((segmentLen < 150) ? "\"%s\"\n" : "\"%148s...\"\n", s.utf8().data() + startOffset);
527
528 if (jitResult != interpreterResult) {
529 dataLogF(" JIT result = %d, blah interpreted result = %d\n", jitResult, interpreterResult);
530 differences--;
531 } else {
532 dataLogF(" Correct result = %d\n", jitResult);
533 }
534
535 if (differences) {
536 for (unsigned j = 2, i = 0; i < m_numSubpatterns; j +=2, i++) {
537 if (offsetVector[j] != interpreterOffsetVector[j])
538 dataLogF(" JIT offset[%d] = %d, interpreted offset[%d] = %d\n", j, offsetVector[j], j, interpreterOffsetVector[j]);
539 if ((offsetVector[j] >= 0) && (offsetVector[j+1] != interpreterOffsetVector[j+1]))
540 dataLogF(" JIT offset[%d] = %d, interpreted offset[%d] = %d\n", j+1, offsetVector[j+1], j+1, interpreterOffsetVector[j+1]);
541 }
542 }
543 }
544 }
545 #endif
546
547 #if ENABLE(REGEXP_TRACING)
548 void RegExp::printTraceData()
549 {
550 char formattedPattern[41];
551 char rawPattern[41];
552
553 strncpy(rawPattern, pattern().utf8().data(), 40);
554 rawPattern[40]= '\0';
555
556 int pattLen = strlen(rawPattern);
557
558 snprintf(formattedPattern, 41, (pattLen <= 38) ? "/%.38s/" : "/%.36s...", rawPattern);
559
560 #if ENABLE(YARR_JIT)
561 Yarr::YarrCodeBlock& codeBlock = m_regExpJITCode;
562
563 const size_t jitAddrSize = 20;
564 char jitAddr[jitAddrSize];
565 if (m_state == JITCode)
566 snprintf(jitAddr, jitAddrSize, "fallback");
567 else
568 snprintf(jitAddr, jitAddrSize, "0x%014lx", reinterpret_cast<unsigned long int>(codeBlock.getAddr()));
569 #else
570 const char* jitAddr = "JIT Off";
571 #endif
572
573 printf("%-40.40s %16.16s %10d %10d\n", formattedPattern, jitAddr, m_rtMatchCallCount, m_rtMatchFoundCount);
574 }
575 #endif
576
577 } // namespace JSC