]> git.saurik.com Git - apple/javascriptcore.git/blame_incremental - runtime/RegExp.cpp
JavaScriptCore-7600.1.4.11.8.tar.gz
[apple/javascriptcore.git] / runtime / RegExp.cpp
... / ...
CommitLineData
1/*
2 * Copyright (C) 1999-2001, 2004 Harri Porten (porten@kde.org)
3 * Copyright (c) 2007, 2008 Apple Inc. All rights reserved.
4 * Copyright (C) 2009 Torch Mobile, Inc.
5 * Copyright (C) 2010 Peter Varga (pvarga@inf.u-szeged.hu), University of Szeged
6 *
7 * This library is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU Lesser General Public
9 * License as published by the Free Software Foundation; either
10 * version 2 of the License, or (at your option) any later version.
11 *
12 * This library is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 * Lesser General Public License for more details.
16 *
17 * You should have received a copy of the GNU Lesser General Public
18 * License along with this library; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20 *
21 */
22
23#include "config.h"
24#include "RegExp.h"
25
26#include "Lexer.h"
27#include "JSCInlines.h"
28#include "RegExpCache.h"
29#include "Yarr.h"
30#include "YarrJIT.h"
31#include <wtf/Assertions.h>
32
33#define REGEXP_FUNC_TEST_DATA_GEN 0
34
35#if REGEXP_FUNC_TEST_DATA_GEN
36#include <stdio.h>
37#include <stdlib.h>
38#include <string.h>
39#endif
40
41namespace JSC {
42
43const ClassInfo RegExp::s_info = { "RegExp", 0, 0, 0, CREATE_METHOD_TABLE(RegExp) };
44
45RegExpFlags regExpFlags(const String& string)
46{
47 RegExpFlags flags = NoFlags;
48
49 for (unsigned i = 0; i < string.length(); ++i) {
50 switch (string[i]) {
51 case 'g':
52 if (flags & FlagGlobal)
53 return InvalidFlags;
54 flags = static_cast<RegExpFlags>(flags | FlagGlobal);
55 break;
56
57 case 'i':
58 if (flags & FlagIgnoreCase)
59 return InvalidFlags;
60 flags = static_cast<RegExpFlags>(flags | FlagIgnoreCase);
61 break;
62
63 case 'm':
64 if (flags & FlagMultiline)
65 return InvalidFlags;
66 flags = static_cast<RegExpFlags>(flags | FlagMultiline);
67 break;
68
69 default:
70 return InvalidFlags;
71 }
72 }
73
74 return flags;
75}
76
77#if REGEXP_FUNC_TEST_DATA_GEN
78class RegExpFunctionalTestCollector {
79 // This class is not thread safe.
80protected:
81 static const char* const s_fileName;
82
83public:
84 static RegExpFunctionalTestCollector* get();
85
86 ~RegExpFunctionalTestCollector();
87
88 void outputOneTest(RegExp*, String, int, int*, int);
89 void clearRegExp(RegExp* regExp)
90 {
91 if (regExp == m_lastRegExp)
92 m_lastRegExp = 0;
93 }
94
95private:
96 RegExpFunctionalTestCollector();
97
98 void outputEscapedString(const String&, bool escapeSlash = false);
99
100 static RegExpFunctionalTestCollector* s_instance;
101 FILE* m_file;
102 RegExp* m_lastRegExp;
103};
104
105const char* const RegExpFunctionalTestCollector::s_fileName = "/tmp/RegExpTestsData";
106RegExpFunctionalTestCollector* RegExpFunctionalTestCollector::s_instance = 0;
107
108RegExpFunctionalTestCollector* RegExpFunctionalTestCollector::get()
109{
110 if (!s_instance)
111 s_instance = new RegExpFunctionalTestCollector();
112
113 return s_instance;
114}
115
116void RegExpFunctionalTestCollector::outputOneTest(RegExp* regExp, String s, int startOffset, int* ovector, int result)
117{
118 if ((!m_lastRegExp) || (m_lastRegExp != regExp)) {
119 m_lastRegExp = regExp;
120 fputc('/', m_file);
121 outputEscapedString(regExp->pattern(), true);
122 fputc('/', m_file);
123 if (regExp->global())
124 fputc('g', m_file);
125 if (regExp->ignoreCase())
126 fputc('i', m_file);
127 if (regExp->multiline())
128 fputc('m', m_file);
129 fprintf(m_file, "\n");
130 }
131
132 fprintf(m_file, " \"");
133 outputEscapedString(s);
134 fprintf(m_file, "\", %d, %d, (", startOffset, result);
135 for (unsigned i = 0; i <= regExp->numSubpatterns(); i++) {
136 int subpatternBegin = ovector[i * 2];
137 int subpatternEnd = ovector[i * 2 + 1];
138 if (subpatternBegin == -1)
139 subpatternEnd = -1;
140 fprintf(m_file, "%d, %d", subpatternBegin, subpatternEnd);
141 if (i < regExp->numSubpatterns())
142 fputs(", ", m_file);
143 }
144
145 fprintf(m_file, ")\n");
146 fflush(m_file);
147}
148
149RegExpFunctionalTestCollector::RegExpFunctionalTestCollector()
150{
151 m_file = fopen(s_fileName, "r+");
152 if (!m_file)
153 m_file = fopen(s_fileName, "w+");
154
155 fseek(m_file, 0L, SEEK_END);
156}
157
158RegExpFunctionalTestCollector::~RegExpFunctionalTestCollector()
159{
160 fclose(m_file);
161 s_instance = 0;
162}
163
164void RegExpFunctionalTestCollector::outputEscapedString(const String& s, bool escapeSlash)
165{
166 int len = s.length();
167
168 for (int i = 0; i < len; ++i) {
169 UChar c = s[i];
170
171 switch (c) {
172 case '\0':
173 fputs("\\0", m_file);
174 break;
175 case '\a':
176 fputs("\\a", m_file);
177 break;
178 case '\b':
179 fputs("\\b", m_file);
180 break;
181 case '\f':
182 fputs("\\f", m_file);
183 break;
184 case '\n':
185 fputs("\\n", m_file);
186 break;
187 case '\r':
188 fputs("\\r", m_file);
189 break;
190 case '\t':
191 fputs("\\t", m_file);
192 break;
193 case '\v':
194 fputs("\\v", m_file);
195 break;
196 case '/':
197 if (escapeSlash)
198 fputs("\\/", m_file);
199 else
200 fputs("/", m_file);
201 break;
202 case '\"':
203 fputs("\\\"", m_file);
204 break;
205 case '\\':
206 fputs("\\\\", m_file);
207 break;
208 case '\?':
209 fputs("\?", m_file);
210 break;
211 default:
212 if (c > 0x7f)
213 fprintf(m_file, "\\u%04x", c);
214 else
215 fputc(c, m_file);
216 break;
217 }
218 }
219}
220#endif
221
222RegExp::RegExp(VM& vm, const String& patternString, RegExpFlags flags)
223 : JSCell(vm, vm.regExpStructure.get())
224 , m_state(NotCompiled)
225 , m_patternString(patternString)
226 , m_flags(flags)
227 , m_constructionError(0)
228 , m_numSubpatterns(0)
229#if ENABLE(REGEXP_TRACING)
230 , m_rtMatchOnlyTotalSubjectStringLen(0.0)
231 , m_rtMatchTotalSubjectStringLen(0.0)
232 , m_rtMatchOnlyCallCount(0)
233 , m_rtMatchOnlyFoundCount(0)
234 , m_rtMatchCallCount(0)
235 , m_rtMatchFoundCount(0)
236#endif
237{
238}
239
240void RegExp::finishCreation(VM& vm)
241{
242 Base::finishCreation(vm);
243 Yarr::YarrPattern pattern(m_patternString, ignoreCase(), multiline(), &m_constructionError);
244 if (m_constructionError)
245 m_state = ParseError;
246 else
247 m_numSubpatterns = pattern.m_numSubpatterns;
248}
249
250void RegExp::destroy(JSCell* cell)
251{
252 RegExp* thisObject = static_cast<RegExp*>(cell);
253#if REGEXP_FUNC_TEST_DATA_GEN
254 RegExpFunctionalTestCollector::get()->clearRegExp(this);
255#endif
256 thisObject->RegExp::~RegExp();
257}
258
259RegExp* RegExp::createWithoutCaching(VM& vm, const String& patternString, RegExpFlags flags)
260{
261 RegExp* regExp = new (NotNull, allocateCell<RegExp>(vm.heap)) RegExp(vm, patternString, flags);
262 regExp->finishCreation(vm);
263 return regExp;
264}
265
266RegExp* RegExp::create(VM& vm, const String& patternString, RegExpFlags flags)
267{
268 return vm.regExpCache()->lookupOrCreate(patternString, flags);
269}
270
271void RegExp::compile(VM* vm, Yarr::YarrCharSize charSize)
272{
273 Yarr::YarrPattern pattern(m_patternString, ignoreCase(), multiline(), &m_constructionError);
274 if (m_constructionError) {
275 RELEASE_ASSERT_NOT_REACHED();
276 m_state = ParseError;
277 return;
278 }
279 ASSERT(m_numSubpatterns == pattern.m_numSubpatterns);
280
281 if (!hasCode()) {
282 ASSERT(m_state == NotCompiled);
283 vm->regExpCache()->addToStrongCache(this);
284 m_state = ByteCode;
285 }
286
287#if ENABLE(YARR_JIT)
288 if (!pattern.m_containsBackreferences && !pattern.containsUnsignedLengthPattern() && vm->canUseRegExpJIT()) {
289 Yarr::jitCompile(pattern, charSize, vm, m_regExpJITCode);
290#if ENABLE(YARR_JIT_DEBUG)
291 if (!m_regExpJITCode.isFallBack())
292 m_state = JITCode;
293 else
294 m_state = ByteCode;
295#else
296 if (!m_regExpJITCode.isFallBack()) {
297 m_state = JITCode;
298 return;
299 }
300#endif
301 }
302#else
303 UNUSED_PARAM(charSize);
304#endif
305
306 m_regExpBytecode = Yarr::byteCompile(pattern, &vm->m_regExpAllocator);
307}
308
309void RegExp::compileIfNecessary(VM& vm, Yarr::YarrCharSize charSize)
310{
311 if (hasCode()) {
312#if ENABLE(YARR_JIT)
313 if (m_state != JITCode)
314 return;
315 if ((charSize == Yarr::Char8) && (m_regExpJITCode.has8BitCode()))
316 return;
317 if ((charSize == Yarr::Char16) && (m_regExpJITCode.has16BitCode()))
318 return;
319#else
320 return;
321#endif
322 }
323
324 compile(&vm, charSize);
325}
326
327int RegExp::match(VM& vm, const String& s, unsigned startOffset, Vector<int, 32>& ovector)
328{
329#if ENABLE(REGEXP_TRACING)
330 m_rtMatchCallCount++;
331 m_rtMatchTotalSubjectStringLen += (double)(s.length() - startOffset);
332#endif
333
334 ASSERT(m_state != ParseError);
335 compileIfNecessary(vm, s.is8Bit() ? Yarr::Char8 : Yarr::Char16);
336
337 int offsetVectorSize = (m_numSubpatterns + 1) * 2;
338 ovector.resize(offsetVectorSize);
339 int* offsetVector = ovector.data();
340
341 int result;
342#if ENABLE(YARR_JIT)
343 if (m_state == JITCode) {
344 if (s.is8Bit())
345 result = m_regExpJITCode.execute(s.characters8(), startOffset, s.length(), offsetVector).start;
346 else
347 result = m_regExpJITCode.execute(s.characters16(), startOffset, s.length(), offsetVector).start;
348#if ENABLE(YARR_JIT_DEBUG)
349 matchCompareWithInterpreter(s, startOffset, offsetVector, result);
350#endif
351 } else
352#endif
353 result = Yarr::interpret(m_regExpBytecode.get(), s, startOffset, reinterpret_cast<unsigned*>(offsetVector));
354
355 // FIXME: The YARR engine should handle unsigned or size_t length matches.
356 // The YARR Interpreter is "unsigned" clean, while the YARR JIT hasn't been addressed.
357 // The offset vector handling needs to change as well.
358 // Right now we convert a match where the offsets overflowed into match failure.
359 // There are two places in WebCore that call the interpreter directly that need to
360 // have their offsets changed to int as well. They are yarr/RegularExpression.cpp
361 // and inspector/ContentSearchUtilities.cpp
362 if (s.length() > INT_MAX) {
363 bool overflowed = false;
364
365 if (result < -1)
366 overflowed = true;
367
368 for (unsigned i = 0; i <= m_numSubpatterns; i++) {
369 if ((offsetVector[i*2] < -1) || ((offsetVector[i*2] >= 0) && (offsetVector[i*2+1] < -1))) {
370 overflowed = true;
371 offsetVector[i*2] = -1;
372 offsetVector[i*2+1] = -1;
373 }
374 }
375
376 if (overflowed)
377 result = -1;
378 }
379
380 ASSERT(result >= -1);
381
382#if REGEXP_FUNC_TEST_DATA_GEN
383 RegExpFunctionalTestCollector::get()->outputOneTest(this, s, startOffset, offsetVector, result);
384#endif
385
386#if ENABLE(REGEXP_TRACING)
387 if (result != -1)
388 m_rtMatchFoundCount++;
389#endif
390
391 return result;
392}
393
394void RegExp::compileMatchOnly(VM* vm, Yarr::YarrCharSize charSize)
395{
396 Yarr::YarrPattern pattern(m_patternString, ignoreCase(), multiline(), &m_constructionError);
397 if (m_constructionError) {
398 RELEASE_ASSERT_NOT_REACHED();
399 m_state = ParseError;
400 return;
401 }
402 ASSERT(m_numSubpatterns == pattern.m_numSubpatterns);
403
404 if (!hasCode()) {
405 ASSERT(m_state == NotCompiled);
406 vm->regExpCache()->addToStrongCache(this);
407 m_state = ByteCode;
408 }
409
410#if ENABLE(YARR_JIT)
411 if (!pattern.m_containsBackreferences && !pattern.containsUnsignedLengthPattern() && vm->canUseRegExpJIT()) {
412 Yarr::jitCompile(pattern, charSize, vm, m_regExpJITCode, Yarr::MatchOnly);
413#if ENABLE(YARR_JIT_DEBUG)
414 if (!m_regExpJITCode.isFallBack())
415 m_state = JITCode;
416 else
417 m_state = ByteCode;
418#else
419 if (!m_regExpJITCode.isFallBack()) {
420 m_state = JITCode;
421 return;
422 }
423#endif
424 }
425#else
426 UNUSED_PARAM(charSize);
427#endif
428
429 m_regExpBytecode = Yarr::byteCompile(pattern, &vm->m_regExpAllocator);
430}
431
432void RegExp::compileIfNecessaryMatchOnly(VM& vm, Yarr::YarrCharSize charSize)
433{
434 if (hasCode()) {
435#if ENABLE(YARR_JIT)
436 if (m_state != JITCode)
437 return;
438 if ((charSize == Yarr::Char8) && (m_regExpJITCode.has8BitCodeMatchOnly()))
439 return;
440 if ((charSize == Yarr::Char16) && (m_regExpJITCode.has16BitCodeMatchOnly()))
441 return;
442#else
443 return;
444#endif
445 }
446
447 compileMatchOnly(&vm, charSize);
448}
449
450MatchResult RegExp::match(VM& vm, const String& s, unsigned startOffset)
451{
452#if ENABLE(REGEXP_TRACING)
453 m_rtMatchOnlyCallCount++;
454 m_rtMatchOnlyTotalSubjectStringLen += (double)(s.length() - startOffset);
455#endif
456
457 ASSERT(m_state != ParseError);
458 compileIfNecessaryMatchOnly(vm, s.is8Bit() ? Yarr::Char8 : Yarr::Char16);
459
460#if ENABLE(YARR_JIT)
461 if (m_state == JITCode) {
462 MatchResult result = s.is8Bit() ?
463 m_regExpJITCode.execute(s.characters8(), startOffset, s.length()) :
464 m_regExpJITCode.execute(s.characters16(), startOffset, s.length());
465#if ENABLE(REGEXP_TRACING)
466 if (!result)
467 m_rtMatchOnlyFoundCount++;
468#endif
469 return result;
470 }
471#endif
472
473 int offsetVectorSize = (m_numSubpatterns + 1) * 2;
474 int* offsetVector;
475 Vector<int, 32> nonReturnedOvector;
476 nonReturnedOvector.resize(offsetVectorSize);
477 offsetVector = nonReturnedOvector.data();
478 int r = Yarr::interpret(m_regExpBytecode.get(), s, startOffset, reinterpret_cast<unsigned*>(offsetVector));
479#if REGEXP_FUNC_TEST_DATA_GEN
480 RegExpFunctionalTestCollector::get()->outputOneTest(this, s, startOffset, offsetVector, result);
481#endif
482
483 if (r >= 0) {
484#if ENABLE(REGEXP_TRACING)
485 m_rtMatchOnlyFoundCount++;
486#endif
487 return MatchResult(r, reinterpret_cast<unsigned*>(offsetVector)[1]);
488 }
489
490 return MatchResult::failed();
491}
492
493void RegExp::invalidateCode()
494{
495 if (!hasCode())
496 return;
497 m_state = NotCompiled;
498#if ENABLE(YARR_JIT)
499 m_regExpJITCode.clear();
500#endif
501 m_regExpBytecode.clear();
502}
503
504#if ENABLE(YARR_JIT_DEBUG)
505void RegExp::matchCompareWithInterpreter(const String& s, int startOffset, int* offsetVector, int jitResult)
506{
507 int offsetVectorSize = (m_numSubpatterns + 1) * 2;
508 Vector<int, 32> interpreterOvector;
509 interpreterOvector.resize(offsetVectorSize);
510 int* interpreterOffsetVector = interpreterOvector.data();
511 int interpreterResult = 0;
512 int differences = 0;
513
514 // Initialize interpreterOffsetVector with the return value (index 0) and the
515 // first subpattern start indicies (even index values) set to -1.
516 // No need to init the subpattern end indicies.
517 for (unsigned j = 0, i = 0; i < m_numSubpatterns + 1; j += 2, i++)
518 interpreterOffsetVector[j] = -1;
519
520 interpreterResult = Yarr::interpret(m_regExpBytecode.get(), s, startOffset, interpreterOffsetVector);
521
522 if (jitResult != interpreterResult)
523 differences++;
524
525 for (unsigned j = 2, i = 0; i < m_numSubpatterns; j +=2, i++)
526 if ((offsetVector[j] != interpreterOffsetVector[j])
527 || ((offsetVector[j] >= 0) && (offsetVector[j+1] != interpreterOffsetVector[j+1])))
528 differences++;
529
530 if (differences) {
531 dataLogF("RegExp Discrepency for /%s/\n string input ", pattern().utf8().data());
532 unsigned segmentLen = s.length() - static_cast<unsigned>(startOffset);
533
534 dataLogF((segmentLen < 150) ? "\"%s\"\n" : "\"%148s...\"\n", s.utf8().data() + startOffset);
535
536 if (jitResult != interpreterResult) {
537 dataLogF(" JIT result = %d, blah interpreted result = %d\n", jitResult, interpreterResult);
538 differences--;
539 } else {
540 dataLogF(" Correct result = %d\n", jitResult);
541 }
542
543 if (differences) {
544 for (unsigned j = 2, i = 0; i < m_numSubpatterns; j +=2, i++) {
545 if (offsetVector[j] != interpreterOffsetVector[j])
546 dataLogF(" JIT offset[%d] = %d, interpreted offset[%d] = %d\n", j, offsetVector[j], j, interpreterOffsetVector[j]);
547 if ((offsetVector[j] >= 0) && (offsetVector[j+1] != interpreterOffsetVector[j+1]))
548 dataLogF(" JIT offset[%d] = %d, interpreted offset[%d] = %d\n", j+1, offsetVector[j+1], j+1, interpreterOffsetVector[j+1]);
549 }
550 }
551 }
552}
553#endif
554
555#if ENABLE(REGEXP_TRACING)
556 void RegExp::printTraceData()
557 {
558 char formattedPattern[41];
559 char rawPattern[41];
560
561 strncpy(rawPattern, pattern().utf8().data(), 40);
562 rawPattern[40]= '\0';
563
564 int pattLen = strlen(rawPattern);
565
566 snprintf(formattedPattern, 41, (pattLen <= 38) ? "/%.38s/" : "/%.36s...", rawPattern);
567
568#if ENABLE(YARR_JIT)
569 Yarr::YarrCodeBlock& codeBlock = m_regExpJITCode;
570
571 const size_t jitAddrSize = 20;
572 char jit8BitMatchOnlyAddr[jitAddrSize];
573 char jit16BitMatchOnlyAddr[jitAddrSize];
574 char jit8BitMatchAddr[jitAddrSize];
575 char jit16BitMatchAddr[jitAddrSize];
576 if (m_state == ByteCode) {
577 snprintf(jit8BitMatchOnlyAddr, jitAddrSize, "fallback ");
578 snprintf(jit16BitMatchOnlyAddr, jitAddrSize, "---- ");
579 snprintf(jit8BitMatchAddr, jitAddrSize, "fallback ");
580 snprintf(jit16BitMatchAddr, jitAddrSize, "---- ");
581 } else {
582 snprintf(jit8BitMatchOnlyAddr, jitAddrSize, "0x%014lx", reinterpret_cast<unsigned long int>(codeBlock.get8BitMatchOnlyAddr()));
583 snprintf(jit16BitMatchOnlyAddr, jitAddrSize, "0x%014lx", reinterpret_cast<unsigned long int>(codeBlock.get16BitMatchOnlyAddr()));
584 snprintf(jit8BitMatchAddr, jitAddrSize, "0x%014lx", reinterpret_cast<unsigned long int>(codeBlock.get8BitMatchAddr()));
585 snprintf(jit16BitMatchAddr, jitAddrSize, "0x%014lx", reinterpret_cast<unsigned long int>(codeBlock.get16BitMatchAddr()));
586 }
587#else
588 const char* jit8BitMatchOnlyAddr = "JIT Off";
589 const char* jit16BitMatchOnlyAddr = "";
590 const char* jit8BitMatchAddr = "JIT Off";
591 const char* jit16BitMatchAddr = "";
592#endif
593 unsigned averageMatchOnlyStringLen = (unsigned)(m_rtMatchOnlyTotalSubjectStringLen / m_rtMatchOnlyCallCount);
594 unsigned averageMatchStringLen = (unsigned)(m_rtMatchTotalSubjectStringLen / m_rtMatchCallCount);
595
596 printf("%-40.40s %16.16s %16.16s %10d %10d %10u\n", formattedPattern, jit8BitMatchOnlyAddr, jit16BitMatchOnlyAddr, m_rtMatchOnlyCallCount, m_rtMatchOnlyFoundCount, averageMatchOnlyStringLen);
597 printf(" %16.16s %16.16s %10d %10d %10u\n", jit8BitMatchAddr, jit16BitMatchAddr, m_rtMatchCallCount, m_rtMatchFoundCount, averageMatchStringLen);
598 }
599#endif
600
601} // namespace JSC