2 * Copyright (C) 1999-2001, 2004 Harri Porten (porten@kde.org)
3 * Copyright (c) 2007, 2008 Apple Inc. All rights reserved.
4 * Copyright (C) 2009 Torch Mobile, Inc.
5 * Copyright (C) 2010 Peter Varga (pvarga@inf.u-szeged.hu), University of Szeged
7 * This library is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU Lesser General Public
9 * License as published by the Free Software Foundation; either
10 * version 2 of the License, or (at your option) any later version.
12 * This library is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 * Lesser General Public License for more details.
17 * You should have received a copy of the GNU Lesser General Public
18 * License along with this library; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
27 #include "JSCInlines.h"
28 #include "RegExpCache.h"
31 #include <wtf/Assertions.h>
33 #define REGEXP_FUNC_TEST_DATA_GEN 0
35 #if REGEXP_FUNC_TEST_DATA_GEN
43 const ClassInfo
RegExp::s_info
= { "RegExp", 0, 0, CREATE_METHOD_TABLE(RegExp
) };
// Builds a RegExpFlags bitmask by walking every character of |string|.
// For each of FlagGlobal, FlagIgnoreCase and FlagMultiline the visible code first
// tests whether the bit is already set and then ORs the bit into |flags|.
// NOTE(review): the per-character dispatch, the action taken when a flag is already
// set, and the final return are on lines missing from this listing -- confirm
// against the complete file before relying on this description.
45 RegExpFlags
regExpFlags(const String
& string
)
47 RegExpFlags flags
= NoFlags
;
49 for (unsigned i
= 0; i
< string
.length(); ++i
) {
// Global flag: duplicate check, then set.
52 if (flags
& FlagGlobal
)
54 flags
= static_cast<RegExpFlags
>(flags
| FlagGlobal
);
// Ignore-case flag: duplicate check, then set.
58 if (flags
& FlagIgnoreCase
)
60 flags
= static_cast<RegExpFlags
>(flags
| FlagIgnoreCase
);
// Multiline flag: duplicate check, then set.
64 if (flags
& FlagMultiline
)
66 flags
= static_cast<RegExpFlags
>(flags
| FlagMultiline
);
77 #if REGEXP_FUNC_TEST_DATA_GEN
// Helper compiled only when REGEXP_FUNC_TEST_DATA_GEN is non-zero. Its methods write
// a textual record of regexp matches to the file named by s_fileName.
// Instances are reached through get(); the last RegExp written is remembered in
// m_lastRegExp so clearRegExp() can compare against it.
// NOTE(review): outputOneTest is declared here taking `String` by value, while the
// out-of-line definition takes `const String&`; the two signatures should agree.
// NOTE(review): access specifiers, the body of clearRegExp and other members are on
// lines missing from this listing.
78 class RegExpFunctionalTestCollector
{
79 // This class is not thread safe.
81 static const char* const s_fileName
;
84 static RegExpFunctionalTestCollector
* get();
86 ~RegExpFunctionalTestCollector();
88 void outputOneTest(RegExp
*, String
, int, int*, int);
89 void clearRegExp(RegExp
* regExp
)
91 if (regExp
== m_lastRegExp
)
96 RegExpFunctionalTestCollector();
98 void outputEscapedString(const String
&, bool escapeSlash
= false);
100 static RegExpFunctionalTestCollector
* s_instance
;
102 RegExp
* m_lastRegExp
;
105 const char* const RegExpFunctionalTestCollector::s_fileName
= "/tmp/RegExpTestsData";
106 RegExpFunctionalTestCollector
* RegExpFunctionalTestCollector::s_instance
= 0;
// Accessor for the shared collector. The visible code allocates a collector with
// `new` and stores it in s_instance.
// NOTE(review): presumably the allocation is guarded by a null check and followed by
// `return s_instance;` -- those lines are missing from this listing; confirm.
108 RegExpFunctionalTestCollector
* RegExpFunctionalTestCollector::get()
111 s_instance
= new RegExpFunctionalTestCollector();
// Writes one match record to m_file.
//   regExp      - expression that was executed; when it differs from m_lastRegExp
//                 its pattern is written first (escaped, with '/' escaping enabled).
//   s           - subject string, written escaped inside double quotes.
//   startOffset - offset the match started from, written as %d.
//   ovector     - begin/end pairs, read for indices 0..numSubpatterns() inclusive.
//   result      - match result, written as %d.
// NOTE(review): the statements executed under the global()/ignoreCase()/multiline()
// tests, under `subpatternBegin == -1`, and under the trailing `i < numSubpatterns()`
// test are on lines missing from this listing.
116 void RegExpFunctionalTestCollector::outputOneTest(RegExp
* regExp
, const String
& s
, int startOffset
, int* ovector
, int result
)
// Emit a pattern header only when the expression changes.
118 if ((!m_lastRegExp
) || (m_lastRegExp
!= regExp
)) {
119 m_lastRegExp
= regExp
;
121 outputEscapedString(regExp
->pattern(), true);
123 if (regExp
->global())
125 if (regExp
->ignoreCase())
127 if (regExp
->multiline())
129 fprintf(m_file
, "\n");
132 fprintf(m_file
, " \"");
133 outputEscapedString(s
);
134 fprintf(m_file
, "\", %d, %d, (", startOffset
, result
);
// Entry 0 plus one entry per subpattern, hence the inclusive upper bound.
135 for (unsigned i
= 0; i
<= regExp
->numSubpatterns(); i
++) {
136 int subpatternBegin
= ovector
[i
* 2];
137 int subpatternEnd
= ovector
[i
* 2 + 1];
138 if (subpatternBegin
== -1)
140 fprintf(m_file
, "%d, %d", subpatternBegin
, subpatternEnd
);
141 if (i
< regExp
->numSubpatterns())
145 fprintf(m_file
, ")\n");
// Opens s_fileName into m_file: first with mode "r+", then with mode "w+", and then
// seeks to the end of the file.
// NOTE(review): the condition that selects the "w+" open is on a line missing from
// this listing (presumably a check that the "r+" open failed). No check of the
// final m_file value is visible before fseek() -- confirm in the complete file.
149 RegExpFunctionalTestCollector::RegExpFunctionalTestCollector()
151 m_file
= fopen(s_fileName
, "r+");
153 m_file
= fopen(s_fileName
, "w+");
155 fseek(m_file
, 0L, SEEK_END
);
// Destructor of the collector.
// NOTE(review): the body is missing from this listing; confirm that it releases m_file.
158 RegExpFunctionalTestCollector::~RegExpFunctionalTestCollector()
// Writes |s| to m_file one character at a time, replacing selected characters with
// escape sequences: \0, \a, \b, \f, \n, \r, \t, \v, \/, \" and \\, plus a \uXXXX
// form written with "%04x".
//   escapeSlash - presumably controls whether '/' is written as \/ ; the test itself
//                 is on a line missing from this listing.
// NOTE(review): the switch/case labels that pick each escape, the declaration of
// |c|, and the path for characters written unescaped are not visible here.
164 void RegExpFunctionalTestCollector::outputEscapedString(const String
& s
, bool escapeSlash
)
166 int len
= s
.length();
168 for (int i
= 0; i
< len
; ++i
) {
173 fputs("\\0", m_file
);
176 fputs("\\a", m_file
);
179 fputs("\\b", m_file
);
182 fputs("\\f", m_file
);
185 fputs("\\n", m_file
);
188 fputs("\\r", m_file
);
191 fputs("\\t", m_file
);
194 fputs("\\v", m_file
);
198 fputs("\\/", m_file
);
203 fputs("\\\"", m_file
);
206 fputs("\\\\", m_file
);
213 fprintf(m_file
, "\\u%04x", c
);
// Constructs a RegExp cell using the VM's regExpStructure. The object starts in the
// NotCompiled state with the given pattern string, no construction error and zero
// subpatterns. When REGEXP_TRACING is enabled the tracing counters are zeroed too.
// NOTE(review): |flags| is not consumed by any initializer visible here; one
// initializer line between m_patternString and m_constructionError is missing from
// this listing and presumably stores it -- confirm.
222 RegExp::RegExp(VM
& vm
, const String
& patternString
, RegExpFlags flags
)
223 : JSCell(vm
, vm
.regExpStructure
.get())
224 , m_state(NotCompiled
)
225 , m_patternString(patternString
)
227 , m_constructionError(0)
228 , m_numSubpatterns(0)
229 #if ENABLE(REGEXP_TRACING)
230 , m_rtMatchOnlyTotalSubjectStringLen(0.0)
231 , m_rtMatchTotalSubjectStringLen(0.0)
232 , m_rtMatchOnlyCallCount(0)
233 , m_rtMatchOnlyFoundCount(0)
234 , m_rtMatchCallCount(0)
235 , m_rtMatchFoundCount(0)
// Completes construction: calls Base::finishCreation, then parses m_patternString
// into a YarrPattern. The parser reports problems through m_constructionError; when
// it is set, m_state becomes ParseError. m_numSubpatterns is taken from the parsed
// pattern.
// NOTE(review): one line between the two assignments is missing from this listing
// (presumably `else`), so whether m_numSubpatterns is also assigned on a parse
// error cannot be determined here.
240 void RegExp::finishCreation(VM
& vm
)
242 Base::finishCreation(vm
);
243 Yarr::YarrPattern
pattern(m_patternString
, ignoreCase(), multiline(), &m_constructionError
);
244 if (m_constructionError
)
245 m_state
= ParseError
;
247 m_numSubpatterns
= pattern
.m_numSubpatterns
;
250 void RegExp::destroy(JSCell
* cell
)
252 RegExp
* thisObject
= static_cast<RegExp
*>(cell
);
253 #if REGEXP_FUNC_TEST_DATA_GEN
254 RegExpFunctionalTestCollector::get()->clearRegExp(this);
256 thisObject
->RegExp::~RegExp();
// Allocates a RegExp cell from vm.heap with placement new, constructs it from
// |patternString| and |flags|, and runs finishCreation() on it. Does not go through
// the VM's RegExpCache.
// NOTE(review): the return statement is on a line missing from this listing
// (presumably `return regExp;`).
259 RegExp
* RegExp::createWithoutCaching(VM
& vm
, const String
& patternString
, RegExpFlags flags
)
261 RegExp
* regExp
= new (NotNull
, allocateCell
<RegExp
>(vm
.heap
)) RegExp(vm
, patternString
, flags
);
262 regExp
->finishCreation(vm
);
266 RegExp
* RegExp::create(VM
& vm
, const String
& patternString
, RegExpFlags flags
)
268 return vm
.regExpCache()->lookupOrCreate(patternString
, flags
);
// Compiles the pattern for full matching (with subpattern offsets).
//   vm       - supplies the RegExpCache, the JIT permission check and the bytecode
//              allocator.
//   charSize - character width passed to Yarr::jitCompile.
// Steps visible in this listing, in order:
//   1. Re-parse m_patternString; a construction error here hits
//      RELEASE_ASSERT_NOT_REACHED.
//   2. Assert the subpattern count matches the value recorded earlier.
//   3. Assert m_state == NotCompiled and add this RegExp to the strong cache.
//   4. If the pattern has no backreferences, no unsigned-length pattern, and the VM
//      permits the regexp JIT, call Yarr::jitCompile and inspect isFallBack().
//   5. Produce bytecode with Yarr::byteCompile.
// NOTE(review): the guards around steps 3-5, the m_state updates and the early
// returns are on lines missing from this listing.
271 void RegExp::compile(VM
* vm
, Yarr::YarrCharSize charSize
)
273 Yarr::YarrPattern
pattern(m_patternString
, ignoreCase(), multiline(), &m_constructionError
);
274 if (m_constructionError
) {
275 RELEASE_ASSERT_NOT_REACHED();
276 #if COMPILER_QUIRK(CONSIDERS_UNREACHABLE_CODE)
277 m_state
= ParseError
;
281 ASSERT(m_numSubpatterns
== pattern
.m_numSubpatterns
);
284 ASSERT(m_state
== NotCompiled
);
285 vm
->regExpCache()->addToStrongCache(this);
290 if (!pattern
.m_containsBackreferences
&& !pattern
.containsUnsignedLengthPattern() && vm
->canUseRegExpJIT()) {
291 Yarr::jitCompile(pattern
, charSize
, vm
, m_regExpJITCode
);
292 if (!m_regExpJITCode
.isFallBack()) {
// Reached in a configuration where charSize is otherwise unused (presumably the
// non-JIT build; the preprocessor lines are not visible).
298 UNUSED_PARAM(charSize
);
302 m_regExpBytecode
= Yarr::byteCompile(pattern
, &vm
->m_regExpAllocator
);
// Decides whether compile() must run for the requested character width. The visible
// tests look at whether m_state is JITCode and whether the JIT code block already
// has 8-bit or 16-bit code matching |charSize|; compile(&vm, charSize) is the final
// statement.
// NOTE(review): the statements controlled by each test (presumably early returns)
// are on lines missing from this listing.
305 void RegExp::compileIfNecessary(VM
& vm
, Yarr::YarrCharSize charSize
)
309 if (m_state
!= JITCode
)
311 if ((charSize
== Yarr::Char8
) && (m_regExpJITCode
.has8BitCode()))
313 if ((charSize
== Yarr::Char16
) && (m_regExpJITCode
.has16BitCode()))
320 compile(&vm
, charSize
);
// Matches |s| starting at |startOffset| and reports subpattern offsets.
//   ovector - resized to (m_numSubpatterns + 1) * 2 ints; receives begin/end pairs.
// The object must not be in the ParseError state (asserted). Compilation happens on
// demand for the string's character width. When m_state is JITCode the JIT code is
// executed on the 8-bit or 16-bit characters; the interpreter call with
// m_regExpBytecode is also present. The result is asserted to be >= -1.
// For subjects longer than INT_MAX, offset pairs that look overflowed are reset to
// -1 (see the FIXME below).
// NOTE(review): the declaration of |result|, the branch selecting 8-bit vs 16-bit
// execution, the `else` leading to the interpreter, the use of |overflowed| and the
// return statement are on lines missing from this listing.
323 int RegExp::match(VM
& vm
, const String
& s
, unsigned startOffset
, Vector
<int, 32>& ovector
)
325 #if ENABLE(REGEXP_TRACING)
326 m_rtMatchCallCount
++;
327 m_rtMatchTotalSubjectStringLen
+= (double)(s
.length() - startOffset
);
330 ASSERT(m_state
!= ParseError
);
331 compileIfNecessary(vm
, s
.is8Bit() ? Yarr::Char8
: Yarr::Char16
);
// Two ints (begin, end) for the whole match plus each subpattern.
333 int offsetVectorSize
= (m_numSubpatterns
+ 1) * 2;
334 ovector
.resize(offsetVectorSize
);
335 int* offsetVector
= ovector
.data();
339 if (m_state
== JITCode
) {
341 result
= m_regExpJITCode
.execute(s
.characters8(), startOffset
, s
.length(), offsetVector
).start
;
343 result
= m_regExpJITCode
.execute(s
.characters16(), startOffset
, s
.length(), offsetVector
).start
;
344 #if ENABLE(YARR_JIT_DEBUG)
345 matchCompareWithInterpreter(s
, startOffset
, offsetVector
, result
);
// The interpreter takes unsigned offsets, hence the reinterpret_cast.
349 result
= Yarr::interpret(m_regExpBytecode
.get(), s
, startOffset
, reinterpret_cast<unsigned*>(offsetVector
));
351 // FIXME: The YARR engine should handle unsigned or size_t length matches.
352 // The YARR Interpreter is "unsigned" clean, while the YARR JIT hasn't been addressed.
353 // The offset vector handling needs to change as well.
354 // Right now we convert a match where the offsets overflowed into match failure.
355 // There are two places in WebCore that call the interpreter directly that need to
356 // have their offsets changed to int as well. They are yarr/RegularExpression.cpp
357 // and inspector/ContentSearchUtilities.cpp
358 if (s
.length() > INT_MAX
) {
359 bool overflowed
= false;
364 for (unsigned i
= 0; i
<= m_numSubpatterns
; i
++) {
// A begin below -1, or a valid begin with an end below -1, marks an overflowed pair.
365 if ((offsetVector
[i
*2] < -1) || ((offsetVector
[i
*2] >= 0) && (offsetVector
[i
*2+1] < -1))) {
367 offsetVector
[i
*2] = -1;
368 offsetVector
[i
*2+1] = -1;
376 ASSERT(result
>= -1);
378 #if REGEXP_FUNC_TEST_DATA_GEN
379 RegExpFunctionalTestCollector::get()->outputOneTest(this, s
, startOffset
, offsetVector
, result
);
382 #if ENABLE(REGEXP_TRACING)
384 m_rtMatchFoundCount
++;
// Match-only counterpart of compile(): same visible sequence (re-parse, assert the
// subpattern count, assert NotCompiled and add to the strong cache, try the JIT,
// byte-compile), except that Yarr::jitCompile is invoked with Yarr::MatchOnly.
// A construction error at this point hits RELEASE_ASSERT_NOT_REACHED.
// NOTE(review): the guards, m_state updates and early returns are on lines missing
// from this listing.
390 void RegExp::compileMatchOnly(VM
* vm
, Yarr::YarrCharSize charSize
)
392 Yarr::YarrPattern
pattern(m_patternString
, ignoreCase(), multiline(), &m_constructionError
);
393 if (m_constructionError
) {
394 RELEASE_ASSERT_NOT_REACHED();
395 #if COMPILER_QUIRK(CONSIDERS_UNREACHABLE_CODE)
396 m_state
= ParseError
;
400 ASSERT(m_numSubpatterns
== pattern
.m_numSubpatterns
);
403 ASSERT(m_state
== NotCompiled
);
404 vm
->regExpCache()->addToStrongCache(this);
409 if (!pattern
.m_containsBackreferences
&& !pattern
.containsUnsignedLengthPattern() && vm
->canUseRegExpJIT()) {
410 Yarr::jitCompile(pattern
, charSize
, vm
, m_regExpJITCode
, Yarr::MatchOnly
);
411 if (!m_regExpJITCode
.isFallBack()) {
417 UNUSED_PARAM(charSize
);
421 m_regExpBytecode
= Yarr::byteCompile(pattern
, &vm
->m_regExpAllocator
);
// Match-only counterpart of compileIfNecessary(): tests m_state and whether the JIT
// code block already has match-only code for |charSize| (8-bit or 16-bit), with
// compileMatchOnly(&vm, charSize) as the final statement.
// NOTE(review): the statements controlled by each test (presumably early returns)
// are on lines missing from this listing.
424 void RegExp::compileIfNecessaryMatchOnly(VM
& vm
, Yarr::YarrCharSize charSize
)
428 if (m_state
!= JITCode
)
430 if ((charSize
== Yarr::Char8
) && (m_regExpJITCode
.has8BitCodeMatchOnly()))
432 if ((charSize
== Yarr::Char16
) && (m_regExpJITCode
.has16BitCodeMatchOnly()))
439 compileMatchOnly(&vm
, charSize
);
// Match-only form: returns a MatchResult for |s| starting at |startOffset| without
// exposing subpattern offsets to the caller.
// The object must not be in the ParseError state (asserted). When m_state is JITCode
// the match-only JIT code is executed on the 8-bit or 16-bit characters. Otherwise
// the interpreter runs into a local offset vector and the result is built from its
// return value |r| and offsetVector[1]; MatchResult::failed() is the last statement.
// NOTE(review): the collector call below passes `result`, but the only visible
// declaration of `result` is the MatchResult local inside the JIT branch; on the
// interpreter path the value is `r`. This only compiles when
// REGEXP_FUNC_TEST_DATA_GEN is non-zero and looks like it should pass `r`.
// NOTE(review): the declaration of |offsetVector|, the test on |r|, and the
// condition guarding the tracing counter in the JIT branch are on lines missing
// from this listing.
442 MatchResult
RegExp::match(VM
& vm
, const String
& s
, unsigned startOffset
)
444 #if ENABLE(REGEXP_TRACING)
445 m_rtMatchOnlyCallCount
++;
446 m_rtMatchOnlyTotalSubjectStringLen
+= (double)(s
.length() - startOffset
);
449 ASSERT(m_state
!= ParseError
);
450 compileIfNecessaryMatchOnly(vm
, s
.is8Bit() ? Yarr::Char8
: Yarr::Char16
);
453 if (m_state
== JITCode
) {
454 MatchResult result
= s
.is8Bit() ?
455 m_regExpJITCode
.execute(s
.characters8(), startOffset
, s
.length()) :
456 m_regExpJITCode
.execute(s
.characters16(), startOffset
, s
.length());
457 #if ENABLE(REGEXP_TRACING)
459 m_rtMatchOnlyFoundCount
++;
// Interpreter path: the offsets live in a scratch vector that is not returned.
465 int offsetVectorSize
= (m_numSubpatterns
+ 1) * 2;
467 Vector
<int, 32> nonReturnedOvector
;
468 nonReturnedOvector
.resize(offsetVectorSize
);
469 offsetVector
= nonReturnedOvector
.data();
470 int r
= Yarr::interpret(m_regExpBytecode
.get(), s
, startOffset
, reinterpret_cast<unsigned*>(offsetVector
));
471 #if REGEXP_FUNC_TEST_DATA_GEN
472 RegExpFunctionalTestCollector::get()->outputOneTest(this, s
, startOffset
, offsetVector
, result
);
476 #if ENABLE(REGEXP_TRACING)
477 m_rtMatchOnlyFoundCount
++;
479 return MatchResult(r
, reinterpret_cast<unsigned*>(offsetVector
)[1]);
482 return MatchResult::failed();
// Discards compiled code: resets m_state to NotCompiled, clears the JIT code block
// and drops the bytecode.
// NOTE(review): lines between the signature and the first assignment are missing
// from this listing (possibly an early-out when nothing is compiled), as are any
// preprocessor guards around the JIT clear.
485 void RegExp::invalidateCode()
489 m_state
= NotCompiled
;
491 m_regExpJITCode
.clear();
493 m_regExpBytecode
= nullptr;
496 #if ENABLE(YARR_JIT_DEBUG)
// Debug aid (YARR_JIT_DEBUG): re-runs the match through the interpreter and logs any
// difference from the JIT's result or subpattern offsets via dataLogF.
//   s, startOffset - subject and start position that were given to the JIT.
//   offsetVector   - offsets produced by the JIT.
//   jitResult      - value returned by the JIT.
// NOTE(review): the counter of differences and the conditions enclosing the logging
// are on lines missing from this listing.
// NOTE(review): interpreterOffsetVector (int*) is passed to Yarr::interpret without
// the reinterpret_cast<unsigned*> used by the other interpret() calls in this file.
// NOTE(review): "\"%148s...\"" sets a minimum field width of 148; truncation to 148
// characters would need "%.148s" -- confirm intent. The argument is also offset by
// startOffset into the UTF-8 bytes, which assumes one byte per character.
// NOTE(review): the log strings contain "Discrepency" and "blah"; left untouched
// here because they are runtime text.
497 void RegExp::matchCompareWithInterpreter(const String
& s
, int startOffset
, int* offsetVector
, int jitResult
)
499 int offsetVectorSize
= (m_numSubpatterns
+ 1) * 2;
500 Vector
<int, 32> interpreterOvector
;
501 interpreterOvector
.resize(offsetVectorSize
);
502 int* interpreterOffsetVector
= interpreterOvector
.data();
503 int interpreterResult
= 0;
506 // Initialize interpreterOffsetVector with the return value (index 0) and the
507 // first subpattern start indices (even index values) set to -1.
508 // No need to init the subpattern end indices.
509 for (unsigned j
= 0, i
= 0; i
< m_numSubpatterns
+ 1; j
+= 2, i
++)
510 interpreterOffsetVector
[j
] = -1;
512 interpreterResult
= Yarr::interpret(m_regExpBytecode
.get(), s
, startOffset
, interpreterOffsetVector
);
514 if (jitResult
!= interpreterResult
)
// Compare subpattern offsets only; j starts at 2 to skip the whole-match pair.
517 for (unsigned j
= 2, i
= 0; i
< m_numSubpatterns
; j
+=2, i
++)
518 if ((offsetVector
[j
] != interpreterOffsetVector
[j
])
519 || ((offsetVector
[j
] >= 0) && (offsetVector
[j
+1] != interpreterOffsetVector
[j
+1])))
523 dataLogF("RegExp Discrepency for /%s/\n string input ", pattern().utf8().data());
524 unsigned segmentLen
= s
.length() - static_cast<unsigned>(startOffset
);
526 dataLogF((segmentLen
< 150) ? "\"%s\"\n" : "\"%148s...\"\n", s
.utf8().data() + startOffset
);
528 if (jitResult
!= interpreterResult
) {
529 dataLogF(" JIT result = %d, blah interpreted result = %d\n", jitResult
, interpreterResult
);
532 dataLogF(" Correct result = %d\n", jitResult
);
536 for (unsigned j
= 2, i
= 0; i
< m_numSubpatterns
; j
+=2, i
++) {
537 if (offsetVector
[j
] != interpreterOffsetVector
[j
])
538 dataLogF(" JIT offset[%d] = %d, interpreted offset[%d] = %d\n", j
, offsetVector
[j
], j
, interpreterOffsetVector
[j
]);
539 if ((offsetVector
[j
] >= 0) && (offsetVector
[j
+1] != interpreterOffsetVector
[j
+1]))
540 dataLogF(" JIT offset[%d] = %d, interpreted offset[%d] = %d\n", j
+1, offsetVector
[j
+1], j
+1, interpreterOffsetVector
[j
+1]);
547 #if ENABLE(REGEXP_TRACING)
// Tracing aid (REGEXP_TRACING): prints two lines for this RegExp with printf --
// the pattern (at most 40 characters, "..." appended when truncated), the JIT entry
// addresses or placeholder text, the call and found counters, and the average
// subject length for the match-only and full-match paths.
// NOTE(review): the declaration of rawPattern is on a line missing from this
// listing; it must hold at least 41 chars for the writes below.
// NOTE(review): the address names are declared both as char arrays and as
// `const char*` constants; presumably these sit in different preprocessor branches
// whose directives are not visible here.
// NOTE(review): the averages divide by m_rtMatchOnlyCallCount / m_rtMatchCallCount
// with no visible zero guard; with a zero count the quotient is not a finite value
// and the conversion to unsigned is not well defined.
// NOTE(review): reinterpret_cast<unsigned long int> printed with %lx assumes a
// pointer fits in unsigned long -- confirm for all supported targets.
548 void RegExp::printTraceData()
550 char formattedPattern
[41];
553 strncpy(rawPattern
, pattern().utf8().data(), 40);
// strncpy does not terminate when the source is 40 chars or longer.
554 rawPattern
[40]= '\0';
556 int pattLen
= strlen(rawPattern
);
558 snprintf(formattedPattern
, 41, (pattLen
<= 38) ? "/%.38s/" : "/%.36s...", rawPattern
);
561 Yarr::YarrCodeBlock
& codeBlock
= m_regExpJITCode
;
563 const size_t jitAddrSize
= 20;
564 char jit8BitMatchOnlyAddr
[jitAddrSize
];
565 char jit16BitMatchOnlyAddr
[jitAddrSize
];
566 char jit8BitMatchAddr
[jitAddrSize
];
567 char jit16BitMatchAddr
[jitAddrSize
];
// Bytecode state: there are no JIT addresses to show, so print placeholders.
568 if (m_state
== ByteCode
) {
569 snprintf(jit8BitMatchOnlyAddr
, jitAddrSize
, "fallback ");
570 snprintf(jit16BitMatchOnlyAddr
, jitAddrSize
, "---- ");
571 snprintf(jit8BitMatchAddr
, jitAddrSize
, "fallback ");
572 snprintf(jit16BitMatchAddr
, jitAddrSize
, "---- ");
574 snprintf(jit8BitMatchOnlyAddr
, jitAddrSize
, "0x%014lx", reinterpret_cast<unsigned long int>(codeBlock
.get8BitMatchOnlyAddr()));
575 snprintf(jit16BitMatchOnlyAddr
, jitAddrSize
, "0x%014lx", reinterpret_cast<unsigned long int>(codeBlock
.get16BitMatchOnlyAddr()));
576 snprintf(jit8BitMatchAddr
, jitAddrSize
, "0x%014lx", reinterpret_cast<unsigned long int>(codeBlock
.get8BitMatchAddr()));
577 snprintf(jit16BitMatchAddr
, jitAddrSize
, "0x%014lx", reinterpret_cast<unsigned long int>(codeBlock
.get16BitMatchAddr()));
580 const char* jit8BitMatchOnlyAddr
= "JIT Off";
581 const char* jit16BitMatchOnlyAddr
= "";
582 const char* jit8BitMatchAddr
= "JIT Off";
583 const char* jit16BitMatchAddr
= "";
585 unsigned averageMatchOnlyStringLen
= (unsigned)(m_rtMatchOnlyTotalSubjectStringLen
/ m_rtMatchOnlyCallCount
);
586 unsigned averageMatchStringLen
= (unsigned)(m_rtMatchTotalSubjectStringLen
/ m_rtMatchCallCount
);
588 printf("%-40.40s %16.16s %16.16s %10d %10d %10u\n", formattedPattern
, jit8BitMatchOnlyAddr
, jit16BitMatchOnlyAddr
, m_rtMatchOnlyCallCount
, m_rtMatchOnlyFoundCount
, averageMatchOnlyStringLen
);
589 printf(" %16.16s %16.16s %10d %10d %10u\n", jit8BitMatchAddr
, jit16BitMatchAddr
, m_rtMatchCallCount
, m_rtMatchFoundCount
, averageMatchStringLen
);