2 * Copyright (C) 1999-2001, 2004 Harri Porten (porten@kde.org)
3 * Copyright (c) 2007, 2008 Apple Inc. All rights reserved.
4 * Copyright (C) 2009 Torch Mobile, Inc.
5 * Copyright (C) 2010 Peter Varga (pvarga@inf.u-szeged.hu), University of Szeged
7 * This library is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU Lesser General Public
9 * License as published by the Free Software Foundation; either
10 * version 2 of the License, or (at your option) any later version.
12 * This library is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 * Lesser General Public License for more details.
17 * You should have received a copy of the GNU Lesser General Public
18 * License along with this library; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
27 #include "RegExpCache.h"
28 #include "yarr/Yarr.h"
29 #include "yarr/YarrJIT.h"
33 #include <wtf/Assertions.h>
34 #include <wtf/OwnArrayPtr.h>
37 #define REGEXP_FUNC_TEST_DATA_GEN 0
41 const ClassInfo
RegExp::s_info
= { "RegExp", 0, 0, 0, CREATE_METHOD_TABLE(RegExp
) };
43 RegExpFlags
regExpFlags(const UString
& string
)
45 RegExpFlags flags
= NoFlags
;
47 for (unsigned i
= 0; i
< string
.length(); ++i
) {
50 if (flags
& FlagGlobal
)
52 flags
= static_cast<RegExpFlags
>(flags
| FlagGlobal
);
56 if (flags
& FlagIgnoreCase
)
58 flags
= static_cast<RegExpFlags
>(flags
| FlagIgnoreCase
);
62 if (flags
& FlagMultiline
)
64 flags
= static_cast<RegExpFlags
>(flags
| FlagMultiline
);
75 #if REGEXP_FUNC_TEST_DATA_GEN
76 class RegExpFunctionalTestCollector
{
77 // This class is not thread safe.
79 static const char* const s_fileName
;
82 static RegExpFunctionalTestCollector
* get();
84 ~RegExpFunctionalTestCollector();
86 void outputOneTest(RegExp
*, UString
, int, int*, int);
87 void clearRegExp(RegExp
* regExp
)
89 if (regExp
== m_lastRegExp
)
94 RegExpFunctionalTestCollector();
96 void outputEscapedUString(const UString
&, bool escapeSlash
= false);
98 static RegExpFunctionalTestCollector
* s_instance
;
100 RegExp
* m_lastRegExp
;
103 const char* const RegExpFunctionalTestCollector::s_fileName
= "/tmp/RegExpTestsData";
104 RegExpFunctionalTestCollector
* RegExpFunctionalTestCollector::s_instance
= 0;
106 RegExpFunctionalTestCollector
* RegExpFunctionalTestCollector::get()
109 s_instance
= new RegExpFunctionalTestCollector();
114 void RegExpFunctionalTestCollector::outputOneTest(RegExp
* regExp
, UString s
, int startOffset
, int* ovector
, int result
)
116 if ((!m_lastRegExp
) || (m_lastRegExp
!= regExp
)) {
117 m_lastRegExp
= regExp
;
119 outputEscapedUString(regExp
->pattern(), true);
121 if (regExp
->global())
123 if (regExp
->ignoreCase())
125 if (regExp
->multiline())
127 fprintf(m_file
, "\n");
130 fprintf(m_file
, " \"");
131 outputEscapedUString(s
);
132 fprintf(m_file
, "\", %d, %d, (", startOffset
, result
);
133 for (unsigned i
= 0; i
<= regExp
->numSubpatterns(); i
++) {
134 int subpatternBegin
= ovector
[i
* 2];
135 int subpatternEnd
= ovector
[i
* 2 + 1];
136 if (subpatternBegin
== -1)
138 fprintf(m_file
, "%d, %d", subpatternBegin
, subpatternEnd
);
139 if (i
< regExp
->numSubpatterns())
143 fprintf(m_file
, ")\n");
147 RegExpFunctionalTestCollector::RegExpFunctionalTestCollector()
149 m_file
= fopen(s_fileName
, "r+");
151 m_file
= fopen(s_fileName
, "w+");
153 fseek(m_file
, 0L, SEEK_END
);
156 RegExpFunctionalTestCollector::~RegExpFunctionalTestCollector()
162 void RegExpFunctionalTestCollector::outputEscapedUString(const UString
& s
, bool escapeSlash
)
164 int len
= s
.length();
166 for (int i
= 0; i
< len
; ++i
) {
171 fputs("\\0", m_file
);
174 fputs("\\a", m_file
);
177 fputs("\\b", m_file
);
180 fputs("\\f", m_file
);
183 fputs("\\n", m_file
);
186 fputs("\\r", m_file
);
189 fputs("\\t", m_file
);
192 fputs("\\v", m_file
);
196 fputs("\\/", m_file
);
201 fputs("\\\"", m_file
);
204 fputs("\\\\", m_file
);
211 fprintf(m_file
, "\\u%04x", c
);
220 RegExp::RegExp(JSGlobalData
& globalData
, const UString
& patternString
, RegExpFlags flags
)
221 : JSCell(globalData
, globalData
.regExpStructure
.get())
222 , m_state(NotCompiled
)
223 , m_patternString(patternString
)
225 , m_constructionError(0)
226 , m_numSubpatterns(0)
227 #if ENABLE(REGEXP_TRACING)
228 , m_rtMatchCallCount(0)
229 , m_rtMatchFoundCount(0)
234 void RegExp::finishCreation(JSGlobalData
& globalData
)
236 Base::finishCreation(globalData
);
237 Yarr::YarrPattern
pattern(m_patternString
, ignoreCase(), multiline(), &m_constructionError
);
238 if (m_constructionError
)
239 m_state
= ParseError
;
241 m_numSubpatterns
= pattern
.m_numSubpatterns
;
244 void RegExp::destroy(JSCell
* cell
)
246 RegExp
* thisObject
= jsCast
<RegExp
*>(cell
);
247 #if REGEXP_FUNC_TEST_DATA_GEN
248 RegExpFunctionalTestCollector::get()->clearRegExp(this);
250 thisObject
->RegExp::~RegExp();
253 RegExp
* RegExp::createWithoutCaching(JSGlobalData
& globalData
, const UString
& patternString
, RegExpFlags flags
)
255 RegExp
* regExp
= new (NotNull
, allocateCell
<RegExp
>(globalData
.heap
)) RegExp(globalData
, patternString
, flags
);
256 regExp
->finishCreation(globalData
);
260 RegExp
* RegExp::create(JSGlobalData
& globalData
, const UString
& patternString
, RegExpFlags flags
)
262 return globalData
.regExpCache()->lookupOrCreate(patternString
, flags
);
265 void RegExp::compile(JSGlobalData
* globalData
, Yarr::YarrCharSize charSize
)
267 Yarr::YarrPattern
pattern(m_patternString
, ignoreCase(), multiline(), &m_constructionError
);
268 if (m_constructionError
) {
269 ASSERT_NOT_REACHED();
270 m_state
= ParseError
;
273 ASSERT(m_numSubpatterns
== pattern
.m_numSubpatterns
);
276 ASSERT(m_state
== NotCompiled
);
277 globalData
->regExpCache()->addToStrongCache(this);
282 if (!pattern
.m_containsBackreferences
&& globalData
->canUseRegExpJIT()) {
283 Yarr::jitCompile(pattern
, charSize
, globalData
, m_regExpJITCode
);
284 #if ENABLE(YARR_JIT_DEBUG)
285 if (!m_regExpJITCode
.isFallBack())
290 if (!m_regExpJITCode
.isFallBack()) {
297 UNUSED_PARAM(charSize
);
300 m_regExpBytecode
= Yarr::byteCompile(pattern
, &globalData
->m_regExpAllocator
);
303 void RegExp::compileIfNecessary(JSGlobalData
& globalData
, Yarr::YarrCharSize charSize
)
307 if (m_state
!= JITCode
)
309 if ((charSize
== Yarr::Char8
) && (m_regExpJITCode
.has8BitCode()))
311 if ((charSize
== Yarr::Char16
) && (m_regExpJITCode
.has16BitCode()))
318 compile(&globalData
, charSize
);
321 int RegExp::match(JSGlobalData
& globalData
, const UString
& s
, unsigned startOffset
, Vector
<int, 32>& ovector
)
323 #if ENABLE(REGEXP_TRACING)
324 m_rtMatchCallCount
++;
327 ASSERT(m_state
!= ParseError
);
328 compileIfNecessary(globalData
, s
.is8Bit() ? Yarr::Char8
: Yarr::Char16
);
330 int offsetVectorSize
= (m_numSubpatterns
+ 1) * 2;
331 ovector
.resize(offsetVectorSize
);
332 int* offsetVector
= ovector
.data();
336 if (m_state
== JITCode
) {
338 result
= m_regExpJITCode
.execute(s
.characters8(), startOffset
, s
.length(), offsetVector
).start
;
340 result
= m_regExpJITCode
.execute(s
.characters16(), startOffset
, s
.length(), offsetVector
).start
;
341 #if ENABLE(YARR_JIT_DEBUG)
342 matchCompareWithInterpreter(s
, startOffset
, offsetVector
, result
);
346 result
= Yarr::interpret(m_regExpBytecode
.get(), s
, startOffset
, reinterpret_cast<unsigned*>(offsetVector
));
348 // FIXME: The YARR engine should handle unsigned or size_t length matches.
349 // The YARR Interpreter is "unsigned" clean, while the YARR JIT hasn't been addressed.
350 // The offset vector handling needs to change as well.
351 // Right now we convert a match where the offsets overflowed into match failure.
352 // There are two places in WebCore that call the interpreter directly that need to
353 // have their offsets changed to int as well. They are platform/text/RegularExpression.cpp
354 // and inspector/ContentSearchUtils.cpp.
355 if (s
.length() > INT_MAX
) {
356 bool overflowed
= false;
361 for (unsigned i
= 0; i
<= m_numSubpatterns
; i
++) {
362 if ((offsetVector
[i
*2] < -1) || ((offsetVector
[i
*2] >= 0) && (offsetVector
[i
*2+1] < -1))) {
364 offsetVector
[i
*2] = -1;
365 offsetVector
[i
*2+1] = -1;
373 ASSERT(result
>= -1);
375 #if REGEXP_FUNC_TEST_DATA_GEN
376 RegExpFunctionalTestCollector::get()->outputOneTest(this, s
, startOffset
, offsetVector
, result
);
379 #if ENABLE(REGEXP_TRACING)
381 m_rtMatchFoundCount
++;
387 void RegExp::compileMatchOnly(JSGlobalData
* globalData
, Yarr::YarrCharSize charSize
)
389 Yarr::YarrPattern
pattern(m_patternString
, ignoreCase(), multiline(), &m_constructionError
);
390 if (m_constructionError
) {
391 ASSERT_NOT_REACHED();
392 m_state
= ParseError
;
395 ASSERT(m_numSubpatterns
== pattern
.m_numSubpatterns
);
398 ASSERT(m_state
== NotCompiled
);
399 globalData
->regExpCache()->addToStrongCache(this);
404 if (!pattern
.m_containsBackreferences
&& globalData
->canUseRegExpJIT()) {
405 Yarr::jitCompile(pattern
, charSize
, globalData
, m_regExpJITCode
, Yarr::MatchOnly
);
406 #if ENABLE(YARR_JIT_DEBUG)
407 if (!m_regExpJITCode
.isFallBack())
412 if (!m_regExpJITCode
.isFallBack()) {
419 UNUSED_PARAM(charSize
);
422 m_regExpBytecode
= Yarr::byteCompile(pattern
, &globalData
->m_regExpAllocator
);
425 void RegExp::compileIfNecessaryMatchOnly(JSGlobalData
& globalData
, Yarr::YarrCharSize charSize
)
429 if (m_state
!= JITCode
)
431 if ((charSize
== Yarr::Char8
) && (m_regExpJITCode
.has8BitCodeMatchOnly()))
433 if ((charSize
== Yarr::Char16
) && (m_regExpJITCode
.has16BitCodeMatchOnly()))
440 compileMatchOnly(&globalData
, charSize
);
443 MatchResult
RegExp::match(JSGlobalData
& globalData
, const UString
& s
, unsigned startOffset
)
445 #if ENABLE(REGEXP_TRACING)
446 m_rtMatchCallCount
++;
449 ASSERT(m_state
!= ParseError
);
450 compileIfNecessaryMatchOnly(globalData
, s
.is8Bit() ? Yarr::Char8
: Yarr::Char16
);
453 if (m_state
== JITCode
) {
454 MatchResult result
= s
.is8Bit() ?
455 m_regExpJITCode
.execute(s
.characters8(), startOffset
, s
.length()) :
456 m_regExpJITCode
.execute(s
.characters16(), startOffset
, s
.length());
457 #if ENABLE(REGEXP_TRACING)
459 m_rtMatchFoundCount
++;
465 int offsetVectorSize
= (m_numSubpatterns
+ 1) * 2;
467 Vector
<int, 32> nonReturnedOvector
;
468 nonReturnedOvector
.resize(offsetVectorSize
);
469 offsetVector
= nonReturnedOvector
.data();
470 int r
= Yarr::interpret(m_regExpBytecode
.get(), s
, startOffset
, reinterpret_cast<unsigned*>(offsetVector
));
471 #if REGEXP_FUNC_TEST_DATA_GEN
472 RegExpFunctionalTestCollector::get()->outputOneTest(this, s
, startOffset
, offsetVector
, result
);
476 #if ENABLE(REGEXP_TRACING)
477 m_rtMatchFoundCount
++;
479 return MatchResult(r
, reinterpret_cast<unsigned*>(offsetVector
)[1]);
482 return MatchResult::failed();
485 void RegExp::invalidateCode()
489 m_state
= NotCompiled
;
491 m_regExpJITCode
.clear();
493 m_regExpBytecode
.clear();
496 #if ENABLE(YARR_JIT_DEBUG)
497 void RegExp::matchCompareWithInterpreter(const UString
& s
, int startOffset
, int* offsetVector
, int jitResult
)
499 int offsetVectorSize
= (m_numSubpatterns
+ 1) * 2;
500 Vector
<int, 32> interpreterOvector
;
501 interpreterOvector
.resize(offsetVectorSize
);
502 int* interpreterOffsetVector
= interpreterOvector
.data();
503 int interpreterResult
= 0;
506 // Initialize interpreterOffsetVector with the return value (index 0) and the
507 // first subpattern start indicies (even index values) set to -1.
508 // No need to init the subpattern end indicies.
509 for (unsigned j
= 0, i
= 0; i
< m_numSubpatterns
+ 1; j
+= 2, i
++)
510 interpreterOffsetVector
[j
] = -1;
512 interpreterResult
= Yarr::interpret(m_regExpBytecode
.get(), s
, startOffset
, interpreterOffsetVector
);
514 if (jitResult
!= interpreterResult
)
517 for (unsigned j
= 2, i
= 0; i
< m_numSubpatterns
; j
+=2, i
++)
518 if ((offsetVector
[j
] != interpreterOffsetVector
[j
])
519 || ((offsetVector
[j
] >= 0) && (offsetVector
[j
+1] != interpreterOffsetVector
[j
+1])))
523 dataLog("RegExp Discrepency for /%s/\n string input ", pattern().utf8().data());
524 unsigned segmentLen
= s
.length() - static_cast<unsigned>(startOffset
);
526 dataLog((segmentLen
< 150) ? "\"%s\"\n" : "\"%148s...\"\n", s
.utf8().data() + startOffset
);
528 if (jitResult
!= interpreterResult
) {
529 dataLog(" JIT result = %d, blah interpreted result = %d\n", jitResult
, interpreterResult
);
532 dataLog(" Correct result = %d\n", jitResult
);
536 for (unsigned j
= 2, i
= 0; i
< m_numSubpatterns
; j
+=2, i
++) {
537 if (offsetVector
[j
] != interpreterOffsetVector
[j
])
538 dataLog(" JIT offset[%d] = %d, interpreted offset[%d] = %d\n", j
, offsetVector
[j
], j
, interpreterOffsetVector
[j
]);
539 if ((offsetVector
[j
] >= 0) && (offsetVector
[j
+1] != interpreterOffsetVector
[j
+1]))
540 dataLog(" JIT offset[%d] = %d, interpreted offset[%d] = %d\n", j
+1, offsetVector
[j
+1], j
+1, interpreterOffsetVector
[j
+1]);
547 #if ENABLE(REGEXP_TRACING)
548 void RegExp::printTraceData()
550 char formattedPattern
[41];
553 strncpy(rawPattern
, pattern().utf8().data(), 40);
554 rawPattern
[40]= '\0';
556 int pattLen
= strlen(rawPattern
);
558 snprintf(formattedPattern
, 41, (pattLen
<= 38) ? "/%.38s/" : "/%.36s...", rawPattern
);
561 Yarr::YarrCodeBlock
& codeBlock
= m_regExpJITCode
;
563 const size_t jitAddrSize
= 20;
564 char jitAddr
[jitAddrSize
];
565 if (m_state
== JITCode
)
566 snprintf(jitAddr
, jitAddrSize
, "fallback");
568 snprintf(jitAddr
, jitAddrSize
, "0x%014lx", reinterpret_cast<unsigned long int>(codeBlock
.getAddr()));
570 const char* jitAddr
= "JIT Off";
573 printf("%-40.40s %16.16s %10d %10d\n", formattedPattern
, jitAddr
, m_rtMatchCallCount
, m_rtMatchFoundCount
);