2 * Copyright (C) 1999-2001, 2004 Harri Porten (porten@kde.org)
3 * Copyright (c) 2007, 2008 Apple Inc. All rights reserved.
4 * Copyright (C) 2009 Torch Mobile, Inc.
5 * Copyright (C) 2010 Peter Varga (pvarga@inf.u-szeged.hu), University of Szeged
7 * This library is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU Lesser General Public
9 * License as published by the Free Software Foundation; either
10 * version 2 of the License, or (at your option) any later version.
12 * This library is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 * Lesser General Public License for more details.
17 * You should have received a copy of the GNU Lesser General Public
18 * License along with this library; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
27 #include "JSCInlines.h"
28 #include "RegExpCache.h"
31 #include <wtf/Assertions.h>
33 #define REGEXP_FUNC_TEST_DATA_GEN 0
35 #if REGEXP_FUNC_TEST_DATA_GEN
43 const ClassInfo
RegExp::s_info
= { "RegExp", 0, 0, 0, CREATE_METHOD_TABLE(RegExp
) };
// Builds a RegExpFlags bitmask from a flag string by walking its characters.
// Starting from NoFlags, the visible code handles FlagGlobal, FlagIgnoreCase
// and FlagMultiline the same way: test whether the bit is already present in
// `flags`, and OR the bit into `flags`.
// NOTE(review): the per-character dispatch, the statement taken when a flag is
// already set, and the final return are not visible here — presumably a
// repeated or unknown flag is rejected; confirm against the full source.
45 RegExpFlags
regExpFlags(const String
& string
)
47 RegExpFlags flags
= NoFlags
;
49 for (unsigned i
= 0; i
< string
.length(); ++i
) {
// Global flag: duplicate check, then set the bit.
52 if (flags
& FlagGlobal
)
54 flags
= static_cast<RegExpFlags
>(flags
| FlagGlobal
);
// Ignore-case flag: duplicate check, then set the bit.
58 if (flags
& FlagIgnoreCase
)
60 flags
= static_cast<RegExpFlags
>(flags
| FlagIgnoreCase
);
// Multiline flag: duplicate check, then set the bit.
64 if (flags
& FlagMultiline
)
66 flags
= static_cast<RegExpFlags
>(flags
| FlagMultiline
);
77 #if REGEXP_FUNC_TEST_DATA_GEN
// Records regular-expression match results to a file as functional-test data.
// Only declared when REGEXP_FUNC_TEST_DATA_GEN is non-zero. Callers obtain the
// collector through the static get() accessor.
78 class RegExpFunctionalTestCollector
{
79 // This class is not thread safe.
// Path of the file the test data is written to.
81 static const char* const s_fileName
;
// Accessor for the collector instance held in s_instance.
84 static RegExpFunctionalTestCollector
* get();
86 ~RegExpFunctionalTestCollector();
// Writes one test record: regexp, subject string, start offset, offset
// vector and match result (in that parameter order).
88 void outputOneTest(RegExp
*, String
, int, int*, int);
// Compares the given regexp with the remembered m_lastRegExp.
// NOTE(review): the statement guarded by this comparison is not visible
// here — presumably it forgets m_lastRegExp; confirm.
89 void clearRegExp(RegExp
* regExp
)
91 if (regExp
== m_lastRegExp
)
96 RegExpFunctionalTestCollector();
// Writes a string with special characters escaped; escapeSlash defaults to false.
98 void outputEscapedString(const String
&, bool escapeSlash
= false);
100 static RegExpFunctionalTestCollector
* s_instance
;
// Regexp whose pattern header was written most recently by outputOneTest().
102 RegExp
* m_lastRegExp
;
105 const char* const RegExpFunctionalTestCollector::s_fileName
= "/tmp/RegExpTestsData";
106 RegExpFunctionalTestCollector
* RegExpFunctionalTestCollector::s_instance
= 0;
// Accessor for the collector instance. The visible statement allocates a new
// RegExpFunctionalTestCollector and stores it in s_instance.
// NOTE(review): the guard around the allocation and the return statement are
// not visible here — presumably the instance is created only once; confirm.
108 RegExpFunctionalTestCollector
* RegExpFunctionalTestCollector::get()
111 s_instance
= new RegExpFunctionalTestCollector();
// Appends one test record to m_file.
// When `regExp` differs from m_lastRegExp it is remembered and a header is
// written first: the escaped pattern (with slash escaping enabled), checks of
// the global / ignoreCase / multiline flags, and a newline.
// The record itself is the escaped subject string, startOffset, result, and a
// parenthesised list of begin/end offsets read from `ovector` for entries
// 0..numSubpatterns() inclusive.
// NOTE(review): the statements executed for each flag check, for an unmatched
// subpattern (begin == -1) and between list entries are not visible here.
// NOTE(review): `!m_lastRegExp` looks redundant with `m_lastRegExp != regExp`
// unless regExp can be null.
116 void RegExpFunctionalTestCollector::outputOneTest(RegExp
* regExp
, String s
, int startOffset
, int* ovector
, int result
)
118 if ((!m_lastRegExp
) || (m_lastRegExp
!= regExp
)) {
119 m_lastRegExp
= regExp
;
121 outputEscapedString(regExp
->pattern(), true);
123 if (regExp
->global())
125 if (regExp
->ignoreCase())
127 if (regExp
->multiline())
129 fprintf(m_file
, "\n");
132 fprintf(m_file
, " \"");
133 outputEscapedString(s
);
134 fprintf(m_file
, "\", %d, %d, (", startOffset
, result
);
// One begin/end pair per entry: index 0 plus each subpattern.
135 for (unsigned i
= 0; i
<= regExp
->numSubpatterns(); i
++) {
136 int subpatternBegin
= ovector
[i
* 2];
137 int subpatternEnd
= ovector
[i
* 2 + 1];
138 if (subpatternBegin
== -1)
140 fprintf(m_file
, "%d, %d", subpatternBegin
, subpatternEnd
);
141 if (i
< regExp
->numSubpatterns())
145 fprintf(m_file
, ")\n");
// Opens the output file named by s_fileName and positions the stream at its
// end. Two fopen calls are visible: mode "r+" and mode "w+".
// NOTE(review): the condition between the two fopen calls is not visible here —
// presumably "w+" is only tried when "r+" fails; confirm.
// NOTE(review): no check that m_file is non-null is visible before fseek.
149 RegExpFunctionalTestCollector::RegExpFunctionalTestCollector()
151 m_file
= fopen(s_fileName
, "r+");
153 m_file
= fopen(s_fileName
, "w+");
155 fseek(m_file
, 0L, SEEK_END
);
// Destructor of the test collector.
// NOTE(review): the body is not visible here — presumably it releases m_file; confirm.
158 RegExpFunctionalTestCollector::~RegExpFunctionalTestCollector()
// Writes `s` to m_file character by character in escaped form. The visible
// outputs are the two-character escapes for NUL, \a, \b, \f, \n, \r, \t, \v,
// an escaped slash, an escaped double quote, an escaped backslash, and a
// "\uXXXX" form (four hex digits) for a character `c`.
// NOTE(review): the case labels that select each output, the use of
// `escapeSlash`, and the handling of ordinary characters are not visible here.
164 void RegExpFunctionalTestCollector::outputEscapedString(const String
& s
, bool escapeSlash
)
166 int len
= s
.length();
168 for (int i
= 0; i
< len
; ++i
) {
173 fputs("\\0", m_file
);
176 fputs("\\a", m_file
);
179 fputs("\\b", m_file
);
182 fputs("\\f", m_file
);
185 fputs("\\n", m_file
);
188 fputs("\\r", m_file
);
191 fputs("\\t", m_file
);
194 fputs("\\v", m_file
);
198 fputs("\\/", m_file
);
203 fputs("\\\"", m_file
);
206 fputs("\\\\", m_file
);
213 fprintf(m_file
, "\\u%04x", c
);
// Constructs a RegExp cell using the VM's regExpStructure. The object starts
// in the NotCompiled state with the pattern string stored, no construction
// error and zero subpatterns; when REGEXP_TRACING is enabled the tracing
// counters and accumulated subject-string lengths start at zero as well.
// NOTE(review): the initializer that consumes `flags` is not visible here.
222 RegExp::RegExp(VM
& vm
, const String
& patternString
, RegExpFlags flags
)
223 : JSCell(vm
, vm
.regExpStructure
.get())
224 , m_state(NotCompiled
)
225 , m_patternString(patternString
)
227 , m_constructionError(0)
228 , m_numSubpatterns(0)
229 #if ENABLE(REGEXP_TRACING)
230 , m_rtMatchOnlyTotalSubjectStringLen(0.0)
231 , m_rtMatchTotalSubjectStringLen(0.0)
232 , m_rtMatchOnlyCallCount(0)
233 , m_rtMatchOnlyFoundCount(0)
234 , m_rtMatchCallCount(0)
235 , m_rtMatchFoundCount(0)
// Second construction phase: runs Base::finishCreation, then builds a
// Yarr::YarrPattern from m_patternString so that a construction error is
// detected up front. An error moves the object to the ParseError state; the
// pattern's subpattern count is copied into m_numSubpatterns.
// NOTE(review): one line between the two assignments is not visible here —
// presumably the count is only stored when there was no error; confirm.
240 void RegExp::finishCreation(VM
& vm
)
242 Base::finishCreation(vm
);
243 Yarr::YarrPattern
pattern(m_patternString
, ignoreCase(), multiline(), &m_constructionError
);
244 if (m_constructionError
)
245 m_state
= ParseError
;
247 m_numSubpatterns
= pattern
.m_numSubpatterns
;
250 void RegExp::destroy(JSCell
* cell
)
252 RegExp
* thisObject
= static_cast<RegExp
*>(cell
);
253 #if REGEXP_FUNC_TEST_DATA_GEN
254 RegExpFunctionalTestCollector::get()->clearRegExp(this);
256 thisObject
->RegExp::~RegExp();
259 RegExp
* RegExp::createWithoutCaching(VM
& vm
, const String
& patternString
, RegExpFlags flags
)
261 RegExp
* regExp
= new (NotNull
, allocateCell
<RegExp
>(vm
.heap
)) RegExp(vm
, patternString
, flags
);
262 regExp
->finishCreation(vm
);
266 RegExp
* RegExp::create(VM
& vm
, const String
& patternString
, RegExpFlags flags
)
268 return vm
.regExpCache()->lookupOrCreate(patternString
, flags
);
// Compiles the pattern for full matching (with subpattern offsets).
// Re-parses m_patternString into a Yarr::YarrPattern; a construction error at
// this point is treated as a must-not-happen condition, and the subpattern
// count is asserted to equal the stored m_numSubpatterns.
// The object is added to the regexp cache's strong cache, JIT compilation is
// attempted when the pattern has no backreferences, no unsigned-length pattern
// and the VM allows the regexp JIT, and bytecode is produced with
// Yarr::byteCompile.
// NOTE(review): the conditions guarding the strong-cache registration, the
// m_state updates after JIT compilation and the early returns are not visible
// here, so the exact fallback order cannot be confirmed from this view.
// NOTE(review): the ParseError assignment follows RELEASE_ASSERT_NOT_REACHED()
// and looks unreachable if that macro always terminates; confirm.
271 void RegExp::compile(VM
* vm
, Yarr::YarrCharSize charSize
)
273 Yarr::YarrPattern
pattern(m_patternString
, ignoreCase(), multiline(), &m_constructionError
);
274 if (m_constructionError
) {
275 RELEASE_ASSERT_NOT_REACHED();
276 m_state
= ParseError
;
279 ASSERT(m_numSubpatterns
== pattern
.m_numSubpatterns
);
282 ASSERT(m_state
== NotCompiled
);
283 vm
->regExpCache()->addToStrongCache(this);
// JIT path: only for patterns the JIT is allowed to handle.
288 if (!pattern
.m_containsBackreferences
&& !pattern
.containsUnsignedLengthPattern() && vm
->canUseRegExpJIT()) {
289 Yarr::jitCompile(pattern
, charSize
, vm
, m_regExpJITCode
);
290 #if ENABLE(YARR_JIT_DEBUG)
291 if (!m_regExpJITCode
.isFallBack())
296 if (!m_regExpJITCode
.isFallBack()) {
303 UNUSED_PARAM(charSize
);
// Interpreter bytecode.
306 m_regExpBytecode
= Yarr::byteCompile(pattern
, &vm
->m_regExpAllocator
);
// Makes sure full-match code exists for the requested character size.
// The visible checks test whether m_state differs from JITCode and whether
// m_regExpJITCode already has 8-bit or 16-bit code for `charSize`; the
// function ends by calling compile(&vm, charSize).
// NOTE(review): the statements taken when a check succeeds are not visible
// here — presumably early returns that skip recompilation; confirm.
309 void RegExp::compileIfNecessary(VM
& vm
, Yarr::YarrCharSize charSize
)
313 if (m_state
!= JITCode
)
315 if ((charSize
== Yarr::Char8
) && (m_regExpJITCode
.has8BitCode()))
317 if ((charSize
== Yarr::Char16
) && (m_regExpJITCode
.has16BitCode()))
324 compile(&vm
, charSize
);
// Matches `s` from startOffset and reports subpattern offsets through `ovector`.
// `ovector` is resized to (m_numSubpatterns + 1) * 2 ints: a begin/end pair
// for the whole match followed by one pair per subpattern.
// The pattern is compiled first if needed for the string's character width.
// When m_state is JITCode the JIT entry point for 8-bit or 16-bit characters
// is executed and its `.start` becomes `result`; a call to the bytecode
// interpreter (Yarr::interpret) is also present.
// `result` is asserted to be >= -1 before the test-collector and tracing hooks.
// NOTE(review): the declaration of `result`, the branch structure between the
// JIT and interpreter calls, and the return statement are not visible here.
327 int RegExp::match(VM
& vm
, const String
& s
, unsigned startOffset
, Vector
<int, 32>& ovector
)
329 #if ENABLE(REGEXP_TRACING)
330 m_rtMatchCallCount
++;
331 m_rtMatchTotalSubjectStringLen
+= (double)(s
.length() - startOffset
);
334 ASSERT(m_state
!= ParseError
);
335 compileIfNecessary(vm
, s
.is8Bit() ? Yarr::Char8
: Yarr::Char16
);
337 int offsetVectorSize
= (m_numSubpatterns
+ 1) * 2;
338 ovector
.resize(offsetVectorSize
);
339 int* offsetVector
= ovector
.data();
343 if (m_state
== JITCode
) {
345 result
= m_regExpJITCode
.execute(s
.characters8(), startOffset
, s
.length(), offsetVector
).start
;
347 result
= m_regExpJITCode
.execute(s
.characters16(), startOffset
, s
.length(), offsetVector
).start
;
348 #if ENABLE(YARR_JIT_DEBUG)
349 matchCompareWithInterpreter(s
, startOffset
, offsetVector
, result
);
// The interpreter writes unsigned offsets, hence the cast of the int buffer.
353 result
= Yarr::interpret(m_regExpBytecode
.get(), s
, startOffset
, reinterpret_cast<unsigned*>(offsetVector
));
355 // FIXME: The YARR engine should handle unsigned or size_t length matches.
356 // The YARR Interpreter is "unsigned" clean, while the YARR JIT hasn't been addressed.
357 // The offset vector handling needs to change as well.
358 // Right now we convert a match where the offsets overflowed into match failure.
359 // There are two places in WebCore that call the interpreter directly that need to
360 // have their offsets changed to int as well. They are yarr/RegularExpression.cpp
361 // and inspector/ContentSearchUtilities.cpp
362 if (s
.length() > INT_MAX
) {
363 bool overflowed
= false;
// A pair counts as overflowed when its begin is below -1, or its begin is
// valid (>= 0) but its end is below -1; such pairs are reset to -1/-1.
368 for (unsigned i
= 0; i
<= m_numSubpatterns
; i
++) {
369 if ((offsetVector
[i
*2] < -1) || ((offsetVector
[i
*2] >= 0) && (offsetVector
[i
*2+1] < -1))) {
371 offsetVector
[i
*2] = -1;
372 offsetVector
[i
*2+1] = -1;
380 ASSERT(result
>= -1);
382 #if REGEXP_FUNC_TEST_DATA_GEN
383 RegExpFunctionalTestCollector::get()->outputOneTest(this, s
, startOffset
, offsetVector
, result
);
386 #if ENABLE(REGEXP_TRACING)
388 m_rtMatchFoundCount
++;
// Compiles the pattern for match-only use. It mirrors compile(), except that
// Yarr::jitCompile is invoked with Yarr::MatchOnly.
// The pattern string is re-parsed; a construction error here is treated as a
// must-not-happen condition, and the subpattern count is asserted to equal
// m_numSubpatterns. The object is added to the regexp cache's strong cache,
// the JIT is tried when the pattern has no backreferences, no unsigned-length
// pattern and the VM allows the regexp JIT, and bytecode is produced with
// Yarr::byteCompile.
// NOTE(review): the guards, m_state updates and early returns are not visible
// here, so the exact fallback order cannot be confirmed from this view.
// NOTE(review): the ParseError assignment follows RELEASE_ASSERT_NOT_REACHED()
// and looks unreachable if that macro always terminates; confirm.
394 void RegExp::compileMatchOnly(VM
* vm
, Yarr::YarrCharSize charSize
)
396 Yarr::YarrPattern
pattern(m_patternString
, ignoreCase(), multiline(), &m_constructionError
);
397 if (m_constructionError
) {
398 RELEASE_ASSERT_NOT_REACHED();
399 m_state
= ParseError
;
402 ASSERT(m_numSubpatterns
== pattern
.m_numSubpatterns
);
405 ASSERT(m_state
== NotCompiled
);
406 vm
->regExpCache()->addToStrongCache(this);
// JIT path: only for patterns the JIT is allowed to handle.
411 if (!pattern
.m_containsBackreferences
&& !pattern
.containsUnsignedLengthPattern() && vm
->canUseRegExpJIT()) {
412 Yarr::jitCompile(pattern
, charSize
, vm
, m_regExpJITCode
, Yarr::MatchOnly
);
413 #if ENABLE(YARR_JIT_DEBUG)
414 if (!m_regExpJITCode
.isFallBack())
419 if (!m_regExpJITCode
.isFallBack()) {
426 UNUSED_PARAM(charSize
);
// Interpreter bytecode.
429 m_regExpBytecode
= Yarr::byteCompile(pattern
, &vm
->m_regExpAllocator
);
// Makes sure match-only code exists for the requested character size.
// The visible checks test whether m_state differs from JITCode and whether
// m_regExpJITCode already has 8-bit or 16-bit match-only code for `charSize`;
// the function ends by calling compileMatchOnly(&vm, charSize).
// NOTE(review): the statements taken when a check succeeds are not visible
// here — presumably early returns that skip recompilation; confirm.
432 void RegExp::compileIfNecessaryMatchOnly(VM
& vm
, Yarr::YarrCharSize charSize
)
436 if (m_state
!= JITCode
)
438 if ((charSize
== Yarr::Char8
) && (m_regExpJITCode
.has8BitCodeMatchOnly()))
440 if ((charSize
== Yarr::Char16
) && (m_regExpJITCode
.has16BitCodeMatchOnly()))
447 compileMatchOnly(&vm
, charSize
);
// Match-only overload: matches `s` from startOffset and returns a MatchResult
// without exposing subpattern offsets to the caller.
// The pattern is compiled for match-only use first if needed. When m_state is
// JITCode, the 8-bit or 16-bit JIT entry point produces the MatchResult
// directly. The interpreter path allocates a local offset vector of
// (m_numSubpatterns + 1) * 2 ints, runs Yarr::interpret, and builds the result
// from the returned start `r` and the second entry of the offset vector.
// MatchResult::failed() is the final return.
// NOTE(review): the declaration of `offsetVector` and the condition that
// selects between the two final returns are not visible here.
// NOTE(review): the test-collector call passes `result`, but the only visible
// `result` is the MatchResult declared in the JIT branch, while the
// interpreter's value is `r` and outputOneTest takes an int — `r` looks
// intended; confirm.
450 MatchResult
RegExp::match(VM
& vm
, const String
& s
, unsigned startOffset
)
452 #if ENABLE(REGEXP_TRACING)
453 m_rtMatchOnlyCallCount
++;
454 m_rtMatchOnlyTotalSubjectStringLen
+= (double)(s
.length() - startOffset
);
457 ASSERT(m_state
!= ParseError
);
458 compileIfNecessaryMatchOnly(vm
, s
.is8Bit() ? Yarr::Char8
: Yarr::Char16
);
461 if (m_state
== JITCode
) {
462 MatchResult result
= s
.is8Bit() ?
463 m_regExpJITCode
.execute(s
.characters8(), startOffset
, s
.length()) :
464 m_regExpJITCode
.execute(s
.characters16(), startOffset
, s
.length());
465 #if ENABLE(REGEXP_TRACING)
467 m_rtMatchOnlyFoundCount
++;
// Interpreter path: offsets go into a scratch vector that is not returned.
473 int offsetVectorSize
= (m_numSubpatterns
+ 1) * 2;
475 Vector
<int, 32> nonReturnedOvector
;
476 nonReturnedOvector
.resize(offsetVectorSize
);
477 offsetVector
= nonReturnedOvector
.data();
478 int r
= Yarr::interpret(m_regExpBytecode
.get(), s
, startOffset
, reinterpret_cast<unsigned*>(offsetVector
));
479 #if REGEXP_FUNC_TEST_DATA_GEN
480 RegExpFunctionalTestCollector::get()->outputOneTest(this, s
, startOffset
, offsetVector
, result
);
484 #if ENABLE(REGEXP_TRACING)
485 m_rtMatchOnlyFoundCount
++;
487 return MatchResult(r
, reinterpret_cast<unsigned*>(offsetVector
)[1]);
490 return MatchResult::failed();
// Discards compiled code: m_state goes back to NotCompiled and both the JIT
// code block and the interpreter bytecode are cleared.
// NOTE(review): any guard or preprocessor condition around these statements
// is not visible here.
493 void RegExp::invalidateCode()
497 m_state
= NotCompiled
;
499 m_regExpJITCode
.clear();
501 m_regExpBytecode
.clear();
504 #if ENABLE(YARR_JIT_DEBUG)
// Debug aid (YARR_JIT_DEBUG builds): re-runs a match in the bytecode
// interpreter and logs any disagreement with the JIT's outcome.
// `offsetVector` and `jitResult` are the JIT's offsets and result. The
// interpreter gets its own offset vector of (m_numSubpatterns + 1) * 2 ints.
// The two results are compared, then each subpattern's begin offset and, when
// the JIT's begin is >= 0, its end offset. Discrepancies are reported with
// dataLogF together with the pattern and the subject string from startOffset.
// NOTE(review): the bookkeeping statements guarded by the comparisons, and the
// conditions around the logging section, are not visible here.
// NOTE(review): "%148s" is a minimum field width, so it does not truncate long
// input — "%.148s" looks intended.
// NOTE(review): Yarr::interpret is given the int* vector directly here, while
// the other call sites in this file cast it to unsigned*; confirm it compiles.
505 void RegExp::matchCompareWithInterpreter(const String
& s
, int startOffset
, int* offsetVector
, int jitResult
)
507 int offsetVectorSize
= (m_numSubpatterns
+ 1) * 2;
508 Vector
<int, 32> interpreterOvector
;
509 interpreterOvector
.resize(offsetVectorSize
);
510 int* interpreterOffsetVector
= interpreterOvector
.data();
511 int interpreterResult
= 0;
514 // Initialize interpreterOffsetVector with the return value (index 0) and the
515 // first subpattern start indices (even index values) set to -1.
516 // No need to init the subpattern end indices.
517 for (unsigned j
= 0, i
= 0; i
< m_numSubpatterns
+ 1; j
+= 2, i
++)
518 interpreterOffsetVector
[j
] = -1;
520 interpreterResult
= Yarr::interpret(m_regExpBytecode
.get(), s
, startOffset
, interpreterOffsetVector
);
522 if (jitResult
!= interpreterResult
)
// Subpattern pairs start at index 2; pair 0 is the overall match.
525 for (unsigned j
= 2, i
= 0; i
< m_numSubpatterns
; j
+=2, i
++)
526 if ((offsetVector
[j
] != interpreterOffsetVector
[j
])
527 || ((offsetVector
[j
] >= 0) && (offsetVector
[j
+1] != interpreterOffsetVector
[j
+1])))
531 dataLogF("RegExp Discrepency for /%s/\n string input ", pattern().utf8().data());
532 unsigned segmentLen
= s
.length() - static_cast<unsigned>(startOffset
);
534 dataLogF((segmentLen
< 150) ? "\"%s\"\n" : "\"%148s...\"\n", s
.utf8().data() + startOffset
);
536 if (jitResult
!= interpreterResult
) {
537 dataLogF(" JIT result = %d, blah interpreted result = %d\n", jitResult
, interpreterResult
);
540 dataLogF(" Correct result = %d\n", jitResult
);
// Per-subpattern report of differing begin and end offsets.
544 for (unsigned j
= 2, i
= 0; i
< m_numSubpatterns
; j
+=2, i
++) {
545 if (offsetVector
[j
] != interpreterOffsetVector
[j
])
546 dataLogF(" JIT offset[%d] = %d, interpreted offset[%d] = %d\n", j
, offsetVector
[j
], j
, interpreterOffsetVector
[j
]);
547 if ((offsetVector
[j
] >= 0) && (offsetVector
[j
+1] != interpreterOffsetVector
[j
+1]))
548 dataLogF(" JIT offset[%d] = %d, interpreted offset[%d] = %d\n", j
+1, offsetVector
[j
+1], j
+1, interpreterOffsetVector
[j
+1]);
555 #if ENABLE(REGEXP_TRACING)
// Tracing aid (REGEXP_TRACING builds): prints two lines of statistics for this
// regexp with printf.
// The pattern is copied into a 40-character buffer and formatted as "/.../",
// or with a trailing "..." when it is longer than 38 characters. The first
// line shows the pattern, the 8-bit and 16-bit match-only JIT entry addresses,
// the match-only call and found counts and the average subject length; the
// second line shows the same columns for full matches.
// In the ByteCode state the address columns read "fallback" / "----"; the
// alternative declarations give "JIT Off" and empty strings.
// NOTE(review): the declaration of `rawPattern` and the preprocessor/else
// structure selecting between the address variants are not visible here.
// NOTE(review): the averages divide by the call counts with no visible zero
// guard; with a zero count the double-to-unsigned conversion is not well
// defined — confirm callers only print regexps that were matched.
556 void RegExp::printTraceData()
558 char formattedPattern
[41];
561 strncpy(rawPattern
, pattern().utf8().data(), 40);
// strncpy does not terminate when the source fills the buffer.
562 rawPattern
[40]= '\0';
564 int pattLen
= strlen(rawPattern
);
566 snprintf(formattedPattern
, 41, (pattLen
<= 38) ? "/%.38s/" : "/%.36s...", rawPattern
);
569 Yarr::YarrCodeBlock
& codeBlock
= m_regExpJITCode
;
571 const size_t jitAddrSize
= 20;
572 char jit8BitMatchOnlyAddr
[jitAddrSize
];
573 char jit16BitMatchOnlyAddr
[jitAddrSize
];
574 char jit8BitMatchAddr
[jitAddrSize
];
575 char jit16BitMatchAddr
[jitAddrSize
];
576 if (m_state
== ByteCode
) {
577 snprintf(jit8BitMatchOnlyAddr
, jitAddrSize
, "fallback ");
578 snprintf(jit16BitMatchOnlyAddr
, jitAddrSize
, "---- ");
579 snprintf(jit8BitMatchAddr
, jitAddrSize
, "fallback ");
580 snprintf(jit16BitMatchAddr
, jitAddrSize
, "---- ");
// JIT entry addresses, printed as zero-padded hex.
582 snprintf(jit8BitMatchOnlyAddr
, jitAddrSize
, "0x%014lx", reinterpret_cast<unsigned long int>(codeBlock
.get8BitMatchOnlyAddr()));
583 snprintf(jit16BitMatchOnlyAddr
, jitAddrSize
, "0x%014lx", reinterpret_cast<unsigned long int>(codeBlock
.get16BitMatchOnlyAddr()));
584 snprintf(jit8BitMatchAddr
, jitAddrSize
, "0x%014lx", reinterpret_cast<unsigned long int>(codeBlock
.get8BitMatchAddr()));
585 snprintf(jit16BitMatchAddr
, jitAddrSize
, "0x%014lx", reinterpret_cast<unsigned long int>(codeBlock
.get16BitMatchAddr()));
588 const char* jit8BitMatchOnlyAddr
= "JIT Off";
589 const char* jit16BitMatchOnlyAddr
= "";
590 const char* jit8BitMatchAddr
= "JIT Off";
591 const char* jit16BitMatchAddr
= "";
// Average subject length = accumulated length / number of calls.
593 unsigned averageMatchOnlyStringLen
= (unsigned)(m_rtMatchOnlyTotalSubjectStringLen
/ m_rtMatchOnlyCallCount
);
594 unsigned averageMatchStringLen
= (unsigned)(m_rtMatchTotalSubjectStringLen
/ m_rtMatchCallCount
);
596 printf("%-40.40s %16.16s %16.16s %10d %10d %10u\n", formattedPattern
, jit8BitMatchOnlyAddr
, jit16BitMatchOnlyAddr
, m_rtMatchOnlyCallCount
, m_rtMatchOnlyFoundCount
, averageMatchOnlyStringLen
);
597 printf(" %16.16s %16.16s %10d %10d %10u\n", jit8BitMatchAddr
, jit16BitMatchAddr
, m_rtMatchCallCount
, m_rtMatchFoundCount
, averageMatchStringLen
);