]> git.saurik.com Git - apple/javascriptcore.git/blob - runtime/RegExp.cpp
JavaScriptCore-7601.1.46.3.tar.gz
[apple/javascriptcore.git] / runtime / RegExp.cpp
1 /*
2 * Copyright (C) 1999-2001, 2004 Harri Porten (porten@kde.org)
3 * Copyright (c) 2007, 2008 Apple Inc. All rights reserved.
4 * Copyright (C) 2009 Torch Mobile, Inc.
5 * Copyright (C) 2010 Peter Varga (pvarga@inf.u-szeged.hu), University of Szeged
6 *
7 * This library is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU Lesser General Public
9 * License as published by the Free Software Foundation; either
10 * version 2 of the License, or (at your option) any later version.
11 *
12 * This library is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 * Lesser General Public License for more details.
16 *
17 * You should have received a copy of the GNU Lesser General Public
18 * License along with this library; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20 *
21 */
22
23 #include "config.h"
24 #include "RegExp.h"
25
26 #include "Lexer.h"
27 #include "JSCInlines.h"
28 #include "RegExpCache.h"
29 #include "Yarr.h"
30 #include "YarrJIT.h"
31 #include <wtf/Assertions.h>
32
33 #define REGEXP_FUNC_TEST_DATA_GEN 0
34
35 #if REGEXP_FUNC_TEST_DATA_GEN
36 #include <stdio.h>
37 #include <stdlib.h>
38 #include <string.h>
39 #endif
40
41 namespace JSC {
42
43 const ClassInfo RegExp::s_info = { "RegExp", 0, 0, CREATE_METHOD_TABLE(RegExp) };
44
45 RegExpFlags regExpFlags(const String& string)
46 {
47 RegExpFlags flags = NoFlags;
48
49 for (unsigned i = 0; i < string.length(); ++i) {
50 switch (string[i]) {
51 case 'g':
52 if (flags & FlagGlobal)
53 return InvalidFlags;
54 flags = static_cast<RegExpFlags>(flags | FlagGlobal);
55 break;
56
57 case 'i':
58 if (flags & FlagIgnoreCase)
59 return InvalidFlags;
60 flags = static_cast<RegExpFlags>(flags | FlagIgnoreCase);
61 break;
62
63 case 'm':
64 if (flags & FlagMultiline)
65 return InvalidFlags;
66 flags = static_cast<RegExpFlags>(flags | FlagMultiline);
67 break;
68
69 default:
70 return InvalidFlags;
71 }
72 }
73
74 return flags;
75 }
76
77 #if REGEXP_FUNC_TEST_DATA_GEN
78 class RegExpFunctionalTestCollector {
79 // This class is not thread safe.
80 protected:
81 static const char* const s_fileName;
82
83 public:
84 static RegExpFunctionalTestCollector* get();
85
86 ~RegExpFunctionalTestCollector();
87
88 void outputOneTest(RegExp*, String, int, int*, int);
89 void clearRegExp(RegExp* regExp)
90 {
91 if (regExp == m_lastRegExp)
92 m_lastRegExp = 0;
93 }
94
95 private:
96 RegExpFunctionalTestCollector();
97
98 void outputEscapedString(const String&, bool escapeSlash = false);
99
100 static RegExpFunctionalTestCollector* s_instance;
101 FILE* m_file;
102 RegExp* m_lastRegExp;
103 };
104
105 const char* const RegExpFunctionalTestCollector::s_fileName = "/tmp/RegExpTestsData";
106 RegExpFunctionalTestCollector* RegExpFunctionalTestCollector::s_instance = 0;
107
108 RegExpFunctionalTestCollector* RegExpFunctionalTestCollector::get()
109 {
110 if (!s_instance)
111 s_instance = new RegExpFunctionalTestCollector();
112
113 return s_instance;
114 }
115
116 void RegExpFunctionalTestCollector::outputOneTest(RegExp* regExp, const String& s, int startOffset, int* ovector, int result)
117 {
118 if ((!m_lastRegExp) || (m_lastRegExp != regExp)) {
119 m_lastRegExp = regExp;
120 fputc('/', m_file);
121 outputEscapedString(regExp->pattern(), true);
122 fputc('/', m_file);
123 if (regExp->global())
124 fputc('g', m_file);
125 if (regExp->ignoreCase())
126 fputc('i', m_file);
127 if (regExp->multiline())
128 fputc('m', m_file);
129 fprintf(m_file, "\n");
130 }
131
132 fprintf(m_file, " \"");
133 outputEscapedString(s);
134 fprintf(m_file, "\", %d, %d, (", startOffset, result);
135 for (unsigned i = 0; i <= regExp->numSubpatterns(); i++) {
136 int subpatternBegin = ovector[i * 2];
137 int subpatternEnd = ovector[i * 2 + 1];
138 if (subpatternBegin == -1)
139 subpatternEnd = -1;
140 fprintf(m_file, "%d, %d", subpatternBegin, subpatternEnd);
141 if (i < regExp->numSubpatterns())
142 fputs(", ", m_file);
143 }
144
145 fprintf(m_file, ")\n");
146 fflush(m_file);
147 }
148
149 RegExpFunctionalTestCollector::RegExpFunctionalTestCollector()
150 {
151 m_file = fopen(s_fileName, "r+");
152 if (!m_file)
153 m_file = fopen(s_fileName, "w+");
154
155 fseek(m_file, 0L, SEEK_END);
156 }
157
158 RegExpFunctionalTestCollector::~RegExpFunctionalTestCollector()
159 {
160 fclose(m_file);
161 s_instance = 0;
162 }
163
164 void RegExpFunctionalTestCollector::outputEscapedString(const String& s, bool escapeSlash)
165 {
166 int len = s.length();
167
168 for (int i = 0; i < len; ++i) {
169 UChar c = s[i];
170
171 switch (c) {
172 case '\0':
173 fputs("\\0", m_file);
174 break;
175 case '\a':
176 fputs("\\a", m_file);
177 break;
178 case '\b':
179 fputs("\\b", m_file);
180 break;
181 case '\f':
182 fputs("\\f", m_file);
183 break;
184 case '\n':
185 fputs("\\n", m_file);
186 break;
187 case '\r':
188 fputs("\\r", m_file);
189 break;
190 case '\t':
191 fputs("\\t", m_file);
192 break;
193 case '\v':
194 fputs("\\v", m_file);
195 break;
196 case '/':
197 if (escapeSlash)
198 fputs("\\/", m_file);
199 else
200 fputs("/", m_file);
201 break;
202 case '\"':
203 fputs("\\\"", m_file);
204 break;
205 case '\\':
206 fputs("\\\\", m_file);
207 break;
208 case '\?':
209 fputs("\?", m_file);
210 break;
211 default:
212 if (c > 0x7f)
213 fprintf(m_file, "\\u%04x", c);
214 else
215 fputc(c, m_file);
216 break;
217 }
218 }
219 }
220 #endif
221
222 RegExp::RegExp(VM& vm, const String& patternString, RegExpFlags flags)
223 : JSCell(vm, vm.regExpStructure.get())
224 , m_state(NotCompiled)
225 , m_patternString(patternString)
226 , m_flags(flags)
227 , m_constructionError(0)
228 , m_numSubpatterns(0)
229 #if ENABLE(REGEXP_TRACING)
230 , m_rtMatchOnlyTotalSubjectStringLen(0.0)
231 , m_rtMatchTotalSubjectStringLen(0.0)
232 , m_rtMatchOnlyCallCount(0)
233 , m_rtMatchOnlyFoundCount(0)
234 , m_rtMatchCallCount(0)
235 , m_rtMatchFoundCount(0)
236 #endif
237 {
238 }
239
240 void RegExp::finishCreation(VM& vm)
241 {
242 Base::finishCreation(vm);
243 Yarr::YarrPattern pattern(m_patternString, ignoreCase(), multiline(), &m_constructionError);
244 if (m_constructionError)
245 m_state = ParseError;
246 else
247 m_numSubpatterns = pattern.m_numSubpatterns;
248 }
249
250 void RegExp::destroy(JSCell* cell)
251 {
252 RegExp* thisObject = static_cast<RegExp*>(cell);
253 #if REGEXP_FUNC_TEST_DATA_GEN
254 RegExpFunctionalTestCollector::get()->clearRegExp(this);
255 #endif
256 thisObject->RegExp::~RegExp();
257 }
258
259 RegExp* RegExp::createWithoutCaching(VM& vm, const String& patternString, RegExpFlags flags)
260 {
261 RegExp* regExp = new (NotNull, allocateCell<RegExp>(vm.heap)) RegExp(vm, patternString, flags);
262 regExp->finishCreation(vm);
263 return regExp;
264 }
265
266 RegExp* RegExp::create(VM& vm, const String& patternString, RegExpFlags flags)
267 {
268 return vm.regExpCache()->lookupOrCreate(patternString, flags);
269 }
270
271 void RegExp::compile(VM* vm, Yarr::YarrCharSize charSize)
272 {
273 Yarr::YarrPattern pattern(m_patternString, ignoreCase(), multiline(), &m_constructionError);
274 if (m_constructionError) {
275 RELEASE_ASSERT_NOT_REACHED();
276 #if COMPILER_QUIRK(CONSIDERS_UNREACHABLE_CODE)
277 m_state = ParseError;
278 return;
279 #endif
280 }
281 ASSERT(m_numSubpatterns == pattern.m_numSubpatterns);
282
283 if (!hasCode()) {
284 ASSERT(m_state == NotCompiled);
285 vm->regExpCache()->addToStrongCache(this);
286 m_state = ByteCode;
287 }
288
289 #if ENABLE(YARR_JIT)
290 if (!pattern.m_containsBackreferences && !pattern.containsUnsignedLengthPattern() && vm->canUseRegExpJIT()) {
291 Yarr::jitCompile(pattern, charSize, vm, m_regExpJITCode);
292 if (!m_regExpJITCode.isFallBack()) {
293 m_state = JITCode;
294 return;
295 }
296 }
297 #else
298 UNUSED_PARAM(charSize);
299 #endif
300
301 m_state = ByteCode;
302 m_regExpBytecode = Yarr::byteCompile(pattern, &vm->m_regExpAllocator);
303 }
304
305 void RegExp::compileIfNecessary(VM& vm, Yarr::YarrCharSize charSize)
306 {
307 if (hasCode()) {
308 #if ENABLE(YARR_JIT)
309 if (m_state != JITCode)
310 return;
311 if ((charSize == Yarr::Char8) && (m_regExpJITCode.has8BitCode()))
312 return;
313 if ((charSize == Yarr::Char16) && (m_regExpJITCode.has16BitCode()))
314 return;
315 #else
316 return;
317 #endif
318 }
319
320 compile(&vm, charSize);
321 }
322
323 int RegExp::match(VM& vm, const String& s, unsigned startOffset, Vector<int, 32>& ovector)
324 {
325 #if ENABLE(REGEXP_TRACING)
326 m_rtMatchCallCount++;
327 m_rtMatchTotalSubjectStringLen += (double)(s.length() - startOffset);
328 #endif
329
330 ASSERT(m_state != ParseError);
331 compileIfNecessary(vm, s.is8Bit() ? Yarr::Char8 : Yarr::Char16);
332
333 int offsetVectorSize = (m_numSubpatterns + 1) * 2;
334 ovector.resize(offsetVectorSize);
335 int* offsetVector = ovector.data();
336
337 int result;
338 #if ENABLE(YARR_JIT)
339 if (m_state == JITCode) {
340 if (s.is8Bit())
341 result = m_regExpJITCode.execute(s.characters8(), startOffset, s.length(), offsetVector).start;
342 else
343 result = m_regExpJITCode.execute(s.characters16(), startOffset, s.length(), offsetVector).start;
344 #if ENABLE(YARR_JIT_DEBUG)
345 matchCompareWithInterpreter(s, startOffset, offsetVector, result);
346 #endif
347 } else
348 #endif
349 result = Yarr::interpret(m_regExpBytecode.get(), s, startOffset, reinterpret_cast<unsigned*>(offsetVector));
350
351 // FIXME: The YARR engine should handle unsigned or size_t length matches.
352 // The YARR Interpreter is "unsigned" clean, while the YARR JIT hasn't been addressed.
353 // The offset vector handling needs to change as well.
354 // Right now we convert a match where the offsets overflowed into match failure.
355 // There are two places in WebCore that call the interpreter directly that need to
356 // have their offsets changed to int as well. They are yarr/RegularExpression.cpp
357 // and inspector/ContentSearchUtilities.cpp
358 if (s.length() > INT_MAX) {
359 bool overflowed = false;
360
361 if (result < -1)
362 overflowed = true;
363
364 for (unsigned i = 0; i <= m_numSubpatterns; i++) {
365 if ((offsetVector[i*2] < -1) || ((offsetVector[i*2] >= 0) && (offsetVector[i*2+1] < -1))) {
366 overflowed = true;
367 offsetVector[i*2] = -1;
368 offsetVector[i*2+1] = -1;
369 }
370 }
371
372 if (overflowed)
373 result = -1;
374 }
375
376 ASSERT(result >= -1);
377
378 #if REGEXP_FUNC_TEST_DATA_GEN
379 RegExpFunctionalTestCollector::get()->outputOneTest(this, s, startOffset, offsetVector, result);
380 #endif
381
382 #if ENABLE(REGEXP_TRACING)
383 if (result != -1)
384 m_rtMatchFoundCount++;
385 #endif
386
387 return result;
388 }
389
390 void RegExp::compileMatchOnly(VM* vm, Yarr::YarrCharSize charSize)
391 {
392 Yarr::YarrPattern pattern(m_patternString, ignoreCase(), multiline(), &m_constructionError);
393 if (m_constructionError) {
394 RELEASE_ASSERT_NOT_REACHED();
395 #if COMPILER_QUIRK(CONSIDERS_UNREACHABLE_CODE)
396 m_state = ParseError;
397 return;
398 #endif
399 }
400 ASSERT(m_numSubpatterns == pattern.m_numSubpatterns);
401
402 if (!hasCode()) {
403 ASSERT(m_state == NotCompiled);
404 vm->regExpCache()->addToStrongCache(this);
405 m_state = ByteCode;
406 }
407
408 #if ENABLE(YARR_JIT)
409 if (!pattern.m_containsBackreferences && !pattern.containsUnsignedLengthPattern() && vm->canUseRegExpJIT()) {
410 Yarr::jitCompile(pattern, charSize, vm, m_regExpJITCode, Yarr::MatchOnly);
411 if (!m_regExpJITCode.isFallBack()) {
412 m_state = JITCode;
413 return;
414 }
415 }
416 #else
417 UNUSED_PARAM(charSize);
418 #endif
419
420 m_state = ByteCode;
421 m_regExpBytecode = Yarr::byteCompile(pattern, &vm->m_regExpAllocator);
422 }
423
424 void RegExp::compileIfNecessaryMatchOnly(VM& vm, Yarr::YarrCharSize charSize)
425 {
426 if (hasCode()) {
427 #if ENABLE(YARR_JIT)
428 if (m_state != JITCode)
429 return;
430 if ((charSize == Yarr::Char8) && (m_regExpJITCode.has8BitCodeMatchOnly()))
431 return;
432 if ((charSize == Yarr::Char16) && (m_regExpJITCode.has16BitCodeMatchOnly()))
433 return;
434 #else
435 return;
436 #endif
437 }
438
439 compileMatchOnly(&vm, charSize);
440 }
441
442 MatchResult RegExp::match(VM& vm, const String& s, unsigned startOffset)
443 {
444 #if ENABLE(REGEXP_TRACING)
445 m_rtMatchOnlyCallCount++;
446 m_rtMatchOnlyTotalSubjectStringLen += (double)(s.length() - startOffset);
447 #endif
448
449 ASSERT(m_state != ParseError);
450 compileIfNecessaryMatchOnly(vm, s.is8Bit() ? Yarr::Char8 : Yarr::Char16);
451
452 #if ENABLE(YARR_JIT)
453 if (m_state == JITCode) {
454 MatchResult result = s.is8Bit() ?
455 m_regExpJITCode.execute(s.characters8(), startOffset, s.length()) :
456 m_regExpJITCode.execute(s.characters16(), startOffset, s.length());
457 #if ENABLE(REGEXP_TRACING)
458 if (!result)
459 m_rtMatchOnlyFoundCount++;
460 #endif
461 return result;
462 }
463 #endif
464
465 int offsetVectorSize = (m_numSubpatterns + 1) * 2;
466 int* offsetVector;
467 Vector<int, 32> nonReturnedOvector;
468 nonReturnedOvector.resize(offsetVectorSize);
469 offsetVector = nonReturnedOvector.data();
470 int r = Yarr::interpret(m_regExpBytecode.get(), s, startOffset, reinterpret_cast<unsigned*>(offsetVector));
471 #if REGEXP_FUNC_TEST_DATA_GEN
472 RegExpFunctionalTestCollector::get()->outputOneTest(this, s, startOffset, offsetVector, result);
473 #endif
474
475 if (r >= 0) {
476 #if ENABLE(REGEXP_TRACING)
477 m_rtMatchOnlyFoundCount++;
478 #endif
479 return MatchResult(r, reinterpret_cast<unsigned*>(offsetVector)[1]);
480 }
481
482 return MatchResult::failed();
483 }
484
485 void RegExp::invalidateCode()
486 {
487 if (!hasCode())
488 return;
489 m_state = NotCompiled;
490 #if ENABLE(YARR_JIT)
491 m_regExpJITCode.clear();
492 #endif
493 m_regExpBytecode = nullptr;
494 }
495
496 #if ENABLE(YARR_JIT_DEBUG)
497 void RegExp::matchCompareWithInterpreter(const String& s, int startOffset, int* offsetVector, int jitResult)
498 {
499 int offsetVectorSize = (m_numSubpatterns + 1) * 2;
500 Vector<int, 32> interpreterOvector;
501 interpreterOvector.resize(offsetVectorSize);
502 int* interpreterOffsetVector = interpreterOvector.data();
503 int interpreterResult = 0;
504 int differences = 0;
505
506 // Initialize interpreterOffsetVector with the return value (index 0) and the
507 // first subpattern start indicies (even index values) set to -1.
508 // No need to init the subpattern end indicies.
509 for (unsigned j = 0, i = 0; i < m_numSubpatterns + 1; j += 2, i++)
510 interpreterOffsetVector[j] = -1;
511
512 interpreterResult = Yarr::interpret(m_regExpBytecode.get(), s, startOffset, interpreterOffsetVector);
513
514 if (jitResult != interpreterResult)
515 differences++;
516
517 for (unsigned j = 2, i = 0; i < m_numSubpatterns; j +=2, i++)
518 if ((offsetVector[j] != interpreterOffsetVector[j])
519 || ((offsetVector[j] >= 0) && (offsetVector[j+1] != interpreterOffsetVector[j+1])))
520 differences++;
521
522 if (differences) {
523 dataLogF("RegExp Discrepency for /%s/\n string input ", pattern().utf8().data());
524 unsigned segmentLen = s.length() - static_cast<unsigned>(startOffset);
525
526 dataLogF((segmentLen < 150) ? "\"%s\"\n" : "\"%148s...\"\n", s.utf8().data() + startOffset);
527
528 if (jitResult != interpreterResult) {
529 dataLogF(" JIT result = %d, blah interpreted result = %d\n", jitResult, interpreterResult);
530 differences--;
531 } else {
532 dataLogF(" Correct result = %d\n", jitResult);
533 }
534
535 if (differences) {
536 for (unsigned j = 2, i = 0; i < m_numSubpatterns; j +=2, i++) {
537 if (offsetVector[j] != interpreterOffsetVector[j])
538 dataLogF(" JIT offset[%d] = %d, interpreted offset[%d] = %d\n", j, offsetVector[j], j, interpreterOffsetVector[j]);
539 if ((offsetVector[j] >= 0) && (offsetVector[j+1] != interpreterOffsetVector[j+1]))
540 dataLogF(" JIT offset[%d] = %d, interpreted offset[%d] = %d\n", j+1, offsetVector[j+1], j+1, interpreterOffsetVector[j+1]);
541 }
542 }
543 }
544 }
545 #endif
546
547 #if ENABLE(REGEXP_TRACING)
548 void RegExp::printTraceData()
549 {
550 char formattedPattern[41];
551 char rawPattern[41];
552
553 strncpy(rawPattern, pattern().utf8().data(), 40);
554 rawPattern[40]= '\0';
555
556 int pattLen = strlen(rawPattern);
557
558 snprintf(formattedPattern, 41, (pattLen <= 38) ? "/%.38s/" : "/%.36s...", rawPattern);
559
560 #if ENABLE(YARR_JIT)
561 Yarr::YarrCodeBlock& codeBlock = m_regExpJITCode;
562
563 const size_t jitAddrSize = 20;
564 char jit8BitMatchOnlyAddr[jitAddrSize];
565 char jit16BitMatchOnlyAddr[jitAddrSize];
566 char jit8BitMatchAddr[jitAddrSize];
567 char jit16BitMatchAddr[jitAddrSize];
568 if (m_state == ByteCode) {
569 snprintf(jit8BitMatchOnlyAddr, jitAddrSize, "fallback ");
570 snprintf(jit16BitMatchOnlyAddr, jitAddrSize, "---- ");
571 snprintf(jit8BitMatchAddr, jitAddrSize, "fallback ");
572 snprintf(jit16BitMatchAddr, jitAddrSize, "---- ");
573 } else {
574 snprintf(jit8BitMatchOnlyAddr, jitAddrSize, "0x%014lx", reinterpret_cast<unsigned long int>(codeBlock.get8BitMatchOnlyAddr()));
575 snprintf(jit16BitMatchOnlyAddr, jitAddrSize, "0x%014lx", reinterpret_cast<unsigned long int>(codeBlock.get16BitMatchOnlyAddr()));
576 snprintf(jit8BitMatchAddr, jitAddrSize, "0x%014lx", reinterpret_cast<unsigned long int>(codeBlock.get8BitMatchAddr()));
577 snprintf(jit16BitMatchAddr, jitAddrSize, "0x%014lx", reinterpret_cast<unsigned long int>(codeBlock.get16BitMatchAddr()));
578 }
579 #else
580 const char* jit8BitMatchOnlyAddr = "JIT Off";
581 const char* jit16BitMatchOnlyAddr = "";
582 const char* jit8BitMatchAddr = "JIT Off";
583 const char* jit16BitMatchAddr = "";
584 #endif
585 unsigned averageMatchOnlyStringLen = (unsigned)(m_rtMatchOnlyTotalSubjectStringLen / m_rtMatchOnlyCallCount);
586 unsigned averageMatchStringLen = (unsigned)(m_rtMatchTotalSubjectStringLen / m_rtMatchCallCount);
587
588 printf("%-40.40s %16.16s %16.16s %10d %10d %10u\n", formattedPattern, jit8BitMatchOnlyAddr, jit16BitMatchOnlyAddr, m_rtMatchOnlyCallCount, m_rtMatchOnlyFoundCount, averageMatchOnlyStringLen);
589 printf(" %16.16s %16.16s %10d %10d %10u\n", jit8BitMatchAddr, jit16BitMatchAddr, m_rtMatchCallCount, m_rtMatchFoundCount, averageMatchStringLen);
590 }
591 #endif
592
593 } // namespace JSC