]> git.saurik.com Git - apple/javascriptcore.git/blame_incremental - runtime/RegExp.cpp
JavaScriptCore-1218.34.tar.gz
[apple/javascriptcore.git] / runtime / RegExp.cpp
... / ...
CommitLineData
1/*
2 * Copyright (C) 1999-2001, 2004 Harri Porten (porten@kde.org)
3 * Copyright (c) 2007, 2008 Apple Inc. All rights reserved.
4 * Copyright (C) 2009 Torch Mobile, Inc.
5 * Copyright (C) 2010 Peter Varga (pvarga@inf.u-szeged.hu), University of Szeged
6 *
7 * This library is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU Lesser General Public
9 * License as published by the Free Software Foundation; either
10 * version 2 of the License, or (at your option) any later version.
11 *
12 * This library is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 * Lesser General Public License for more details.
16 *
17 * You should have received a copy of the GNU Lesser General Public
18 * License along with this library; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20 *
21 */
22
23#include "config.h"
24#include "RegExp.h"
25
26#include "Lexer.h"
27#include "Operations.h"
28#include "RegExpCache.h"
29#include "Yarr.h"
30#include "YarrJIT.h"
31#include <stdio.h>
32#include <stdlib.h>
33#include <string.h>
34#include <wtf/Assertions.h>
35#include <wtf/OwnArrayPtr.h>
36
37#define REGEXP_FUNC_TEST_DATA_GEN 0
38
39namespace JSC {
40
41const ClassInfo RegExp::s_info = { "RegExp", 0, 0, 0, CREATE_METHOD_TABLE(RegExp) };
42
43RegExpFlags regExpFlags(const String& string)
44{
45 RegExpFlags flags = NoFlags;
46
47 for (unsigned i = 0; i < string.length(); ++i) {
48 switch (string[i]) {
49 case 'g':
50 if (flags & FlagGlobal)
51 return InvalidFlags;
52 flags = static_cast<RegExpFlags>(flags | FlagGlobal);
53 break;
54
55 case 'i':
56 if (flags & FlagIgnoreCase)
57 return InvalidFlags;
58 flags = static_cast<RegExpFlags>(flags | FlagIgnoreCase);
59 break;
60
61 case 'm':
62 if (flags & FlagMultiline)
63 return InvalidFlags;
64 flags = static_cast<RegExpFlags>(flags | FlagMultiline);
65 break;
66
67 default:
68 return InvalidFlags;
69 }
70 }
71
72 return flags;
73}
74
75#if REGEXP_FUNC_TEST_DATA_GEN
76class RegExpFunctionalTestCollector {
77 // This class is not thread safe.
78protected:
79 static const char* const s_fileName;
80
81public:
82 static RegExpFunctionalTestCollector* get();
83
84 ~RegExpFunctionalTestCollector();
85
86 void outputOneTest(RegExp*, String, int, int*, int);
87 void clearRegExp(RegExp* regExp)
88 {
89 if (regExp == m_lastRegExp)
90 m_lastRegExp = 0;
91 }
92
93private:
94 RegExpFunctionalTestCollector();
95
96 void outputEscapedString(const String&, bool escapeSlash = false);
97
98 static RegExpFunctionalTestCollector* s_instance;
99 FILE* m_file;
100 RegExp* m_lastRegExp;
101};
102
103const char* const RegExpFunctionalTestCollector::s_fileName = "/tmp/RegExpTestsData";
104RegExpFunctionalTestCollector* RegExpFunctionalTestCollector::s_instance = 0;
105
106RegExpFunctionalTestCollector* RegExpFunctionalTestCollector::get()
107{
108 if (!s_instance)
109 s_instance = new RegExpFunctionalTestCollector();
110
111 return s_instance;
112}
113
114void RegExpFunctionalTestCollector::outputOneTest(RegExp* regExp, String s, int startOffset, int* ovector, int result)
115{
116 if ((!m_lastRegExp) || (m_lastRegExp != regExp)) {
117 m_lastRegExp = regExp;
118 fputc('/', m_file);
119 outputEscapedString(regExp->pattern(), true);
120 fputc('/', m_file);
121 if (regExp->global())
122 fputc('g', m_file);
123 if (regExp->ignoreCase())
124 fputc('i', m_file);
125 if (regExp->multiline())
126 fputc('m', m_file);
127 fprintf(m_file, "\n");
128 }
129
130 fprintf(m_file, " \"");
131 outputEscapedString(s);
132 fprintf(m_file, "\", %d, %d, (", startOffset, result);
133 for (unsigned i = 0; i <= regExp->numSubpatterns(); i++) {
134 int subpatternBegin = ovector[i * 2];
135 int subpatternEnd = ovector[i * 2 + 1];
136 if (subpatternBegin == -1)
137 subpatternEnd = -1;
138 fprintf(m_file, "%d, %d", subpatternBegin, subpatternEnd);
139 if (i < regExp->numSubpatterns())
140 fputs(", ", m_file);
141 }
142
143 fprintf(m_file, ")\n");
144 fflush(m_file);
145}
146
147RegExpFunctionalTestCollector::RegExpFunctionalTestCollector()
148{
149 m_file = fopen(s_fileName, "r+");
150 if (!m_file)
151 m_file = fopen(s_fileName, "w+");
152
153 fseek(m_file, 0L, SEEK_END);
154}
155
156RegExpFunctionalTestCollector::~RegExpFunctionalTestCollector()
157{
158 fclose(m_file);
159 s_instance = 0;
160}
161
162void RegExpFunctionalTestCollector::outputEscapedString(const String& s, bool escapeSlash)
163{
164 int len = s.length();
165
166 for (int i = 0; i < len; ++i) {
167 UChar c = s[i];
168
169 switch (c) {
170 case '\0':
171 fputs("\\0", m_file);
172 break;
173 case '\a':
174 fputs("\\a", m_file);
175 break;
176 case '\b':
177 fputs("\\b", m_file);
178 break;
179 case '\f':
180 fputs("\\f", m_file);
181 break;
182 case '\n':
183 fputs("\\n", m_file);
184 break;
185 case '\r':
186 fputs("\\r", m_file);
187 break;
188 case '\t':
189 fputs("\\t", m_file);
190 break;
191 case '\v':
192 fputs("\\v", m_file);
193 break;
194 case '/':
195 if (escapeSlash)
196 fputs("\\/", m_file);
197 else
198 fputs("/", m_file);
199 break;
200 case '\"':
201 fputs("\\\"", m_file);
202 break;
203 case '\\':
204 fputs("\\\\", m_file);
205 break;
206 case '\?':
207 fputs("\?", m_file);
208 break;
209 default:
210 if (c > 0x7f)
211 fprintf(m_file, "\\u%04x", c);
212 else
213 fputc(c, m_file);
214 break;
215 }
216 }
217}
218#endif
219
220RegExp::RegExp(VM& vm, const String& patternString, RegExpFlags flags)
221 : JSCell(vm, vm.regExpStructure.get())
222 , m_state(NotCompiled)
223 , m_patternString(patternString)
224 , m_flags(flags)
225 , m_constructionError(0)
226 , m_numSubpatterns(0)
227#if ENABLE(REGEXP_TRACING)
228 , m_rtMatchCallCount(0)
229 , m_rtMatchFoundCount(0)
230#endif
231{
232}
233
234void RegExp::finishCreation(VM& vm)
235{
236 Base::finishCreation(vm);
237 Yarr::YarrPattern pattern(m_patternString, ignoreCase(), multiline(), &m_constructionError);
238 if (m_constructionError)
239 m_state = ParseError;
240 else
241 m_numSubpatterns = pattern.m_numSubpatterns;
242}
243
244void RegExp::destroy(JSCell* cell)
245{
246 RegExp* thisObject = static_cast<RegExp*>(cell);
247#if REGEXP_FUNC_TEST_DATA_GEN
248 RegExpFunctionalTestCollector::get()->clearRegExp(this);
249#endif
250 thisObject->RegExp::~RegExp();
251}
252
253RegExp* RegExp::createWithoutCaching(VM& vm, const String& patternString, RegExpFlags flags)
254{
255 RegExp* regExp = new (NotNull, allocateCell<RegExp>(vm.heap)) RegExp(vm, patternString, flags);
256 regExp->finishCreation(vm);
257 return regExp;
258}
259
260RegExp* RegExp::create(VM& vm, const String& patternString, RegExpFlags flags)
261{
262 return vm.regExpCache()->lookupOrCreate(patternString, flags);
263}
264
265void RegExp::compile(VM* vm, Yarr::YarrCharSize charSize)
266{
267 Yarr::YarrPattern pattern(m_patternString, ignoreCase(), multiline(), &m_constructionError);
268 if (m_constructionError) {
269 RELEASE_ASSERT_NOT_REACHED();
270 m_state = ParseError;
271 return;
272 }
273 ASSERT(m_numSubpatterns == pattern.m_numSubpatterns);
274
275 if (!hasCode()) {
276 ASSERT(m_state == NotCompiled);
277 vm->regExpCache()->addToStrongCache(this);
278 m_state = ByteCode;
279 }
280
281#if ENABLE(YARR_JIT)
282 if (!pattern.m_containsBackreferences && vm->canUseRegExpJIT()) {
283 Yarr::jitCompile(pattern, charSize, vm, m_regExpJITCode);
284#if ENABLE(YARR_JIT_DEBUG)
285 if (!m_regExpJITCode.isFallBack())
286 m_state = JITCode;
287 else
288 m_state = ByteCode;
289#else
290 if (!m_regExpJITCode.isFallBack()) {
291 m_state = JITCode;
292 return;
293 }
294#endif
295 }
296#else
297 UNUSED_PARAM(charSize);
298#endif
299
300 m_regExpBytecode = Yarr::byteCompile(pattern, &vm->m_regExpAllocator);
301}
302
303void RegExp::compileIfNecessary(VM& vm, Yarr::YarrCharSize charSize)
304{
305 if (hasCode()) {
306#if ENABLE(YARR_JIT)
307 if (m_state != JITCode)
308 return;
309 if ((charSize == Yarr::Char8) && (m_regExpJITCode.has8BitCode()))
310 return;
311 if ((charSize == Yarr::Char16) && (m_regExpJITCode.has16BitCode()))
312 return;
313#else
314 return;
315#endif
316 }
317
318 compile(&vm, charSize);
319}
320
321int RegExp::match(VM& vm, const String& s, unsigned startOffset, Vector<int, 32>& ovector)
322{
323#if ENABLE(REGEXP_TRACING)
324 m_rtMatchCallCount++;
325#endif
326
327 ASSERT(m_state != ParseError);
328 compileIfNecessary(vm, s.is8Bit() ? Yarr::Char8 : Yarr::Char16);
329
330 int offsetVectorSize = (m_numSubpatterns + 1) * 2;
331 ovector.resize(offsetVectorSize);
332 int* offsetVector = ovector.data();
333
334 int result;
335#if ENABLE(YARR_JIT)
336 if (m_state == JITCode) {
337 if (s.is8Bit())
338 result = m_regExpJITCode.execute(s.characters8(), startOffset, s.length(), offsetVector).start;
339 else
340 result = m_regExpJITCode.execute(s.characters16(), startOffset, s.length(), offsetVector).start;
341#if ENABLE(YARR_JIT_DEBUG)
342 matchCompareWithInterpreter(s, startOffset, offsetVector, result);
343#endif
344 } else
345#endif
346 result = Yarr::interpret(m_regExpBytecode.get(), s, startOffset, reinterpret_cast<unsigned*>(offsetVector));
347
348 // FIXME: The YARR engine should handle unsigned or size_t length matches.
349 // The YARR Interpreter is "unsigned" clean, while the YARR JIT hasn't been addressed.
350 // The offset vector handling needs to change as well.
351 // Right now we convert a match where the offsets overflowed into match failure.
352 // There are two places in WebCore that call the interpreter directly that need to
353 // have their offsets changed to int as well. They are platform/text/RegularExpression.cpp
354 // and inspector/ContentSearchUtils.cpp.
355 if (s.length() > INT_MAX) {
356 bool overflowed = false;
357
358 if (result < -1)
359 overflowed = true;
360
361 for (unsigned i = 0; i <= m_numSubpatterns; i++) {
362 if ((offsetVector[i*2] < -1) || ((offsetVector[i*2] >= 0) && (offsetVector[i*2+1] < -1))) {
363 overflowed = true;
364 offsetVector[i*2] = -1;
365 offsetVector[i*2+1] = -1;
366 }
367 }
368
369 if (overflowed)
370 result = -1;
371 }
372
373 ASSERT(result >= -1);
374
375#if REGEXP_FUNC_TEST_DATA_GEN
376 RegExpFunctionalTestCollector::get()->outputOneTest(this, s, startOffset, offsetVector, result);
377#endif
378
379#if ENABLE(REGEXP_TRACING)
380 if (result != -1)
381 m_rtMatchFoundCount++;
382#endif
383
384 return result;
385}
386
387void RegExp::compileMatchOnly(VM* vm, Yarr::YarrCharSize charSize)
388{
389 Yarr::YarrPattern pattern(m_patternString, ignoreCase(), multiline(), &m_constructionError);
390 if (m_constructionError) {
391 RELEASE_ASSERT_NOT_REACHED();
392 m_state = ParseError;
393 return;
394 }
395 ASSERT(m_numSubpatterns == pattern.m_numSubpatterns);
396
397 if (!hasCode()) {
398 ASSERT(m_state == NotCompiled);
399 vm->regExpCache()->addToStrongCache(this);
400 m_state = ByteCode;
401 }
402
403#if ENABLE(YARR_JIT)
404 if (!pattern.m_containsBackreferences && vm->canUseRegExpJIT()) {
405 Yarr::jitCompile(pattern, charSize, vm, m_regExpJITCode, Yarr::MatchOnly);
406#if ENABLE(YARR_JIT_DEBUG)
407 if (!m_regExpJITCode.isFallBack())
408 m_state = JITCode;
409 else
410 m_state = ByteCode;
411#else
412 if (!m_regExpJITCode.isFallBack()) {
413 m_state = JITCode;
414 return;
415 }
416#endif
417 }
418#else
419 UNUSED_PARAM(charSize);
420#endif
421
422 m_regExpBytecode = Yarr::byteCompile(pattern, &vm->m_regExpAllocator);
423}
424
425void RegExp::compileIfNecessaryMatchOnly(VM& vm, Yarr::YarrCharSize charSize)
426{
427 if (hasCode()) {
428#if ENABLE(YARR_JIT)
429 if (m_state != JITCode)
430 return;
431 if ((charSize == Yarr::Char8) && (m_regExpJITCode.has8BitCodeMatchOnly()))
432 return;
433 if ((charSize == Yarr::Char16) && (m_regExpJITCode.has16BitCodeMatchOnly()))
434 return;
435#else
436 return;
437#endif
438 }
439
440 compileMatchOnly(&vm, charSize);
441}
442
443MatchResult RegExp::match(VM& vm, const String& s, unsigned startOffset)
444{
445#if ENABLE(REGEXP_TRACING)
446 m_rtMatchCallCount++;
447#endif
448
449 ASSERT(m_state != ParseError);
450 compileIfNecessaryMatchOnly(vm, s.is8Bit() ? Yarr::Char8 : Yarr::Char16);
451
452#if ENABLE(YARR_JIT)
453 if (m_state == JITCode) {
454 MatchResult result = s.is8Bit() ?
455 m_regExpJITCode.execute(s.characters8(), startOffset, s.length()) :
456 m_regExpJITCode.execute(s.characters16(), startOffset, s.length());
457#if ENABLE(REGEXP_TRACING)
458 if (!result)
459 m_rtMatchFoundCount++;
460#endif
461 return result;
462 }
463#endif
464
465 int offsetVectorSize = (m_numSubpatterns + 1) * 2;
466 int* offsetVector;
467 Vector<int, 32> nonReturnedOvector;
468 nonReturnedOvector.resize(offsetVectorSize);
469 offsetVector = nonReturnedOvector.data();
470 int r = Yarr::interpret(m_regExpBytecode.get(), s, startOffset, reinterpret_cast<unsigned*>(offsetVector));
471#if REGEXP_FUNC_TEST_DATA_GEN
472 RegExpFunctionalTestCollector::get()->outputOneTest(this, s, startOffset, offsetVector, result);
473#endif
474
475 if (r >= 0) {
476#if ENABLE(REGEXP_TRACING)
477 m_rtMatchFoundCount++;
478#endif
479 return MatchResult(r, reinterpret_cast<unsigned*>(offsetVector)[1]);
480 }
481
482 return MatchResult::failed();
483}
484
485void RegExp::invalidateCode()
486{
487 if (!hasCode())
488 return;
489 m_state = NotCompiled;
490#if ENABLE(YARR_JIT)
491 m_regExpJITCode.clear();
492#endif
493 m_regExpBytecode.clear();
494}
495
496#if ENABLE(YARR_JIT_DEBUG)
497void RegExp::matchCompareWithInterpreter(const String& s, int startOffset, int* offsetVector, int jitResult)
498{
499 int offsetVectorSize = (m_numSubpatterns + 1) * 2;
500 Vector<int, 32> interpreterOvector;
501 interpreterOvector.resize(offsetVectorSize);
502 int* interpreterOffsetVector = interpreterOvector.data();
503 int interpreterResult = 0;
504 int differences = 0;
505
506 // Initialize interpreterOffsetVector with the return value (index 0) and the
507 // first subpattern start indicies (even index values) set to -1.
508 // No need to init the subpattern end indicies.
509 for (unsigned j = 0, i = 0; i < m_numSubpatterns + 1; j += 2, i++)
510 interpreterOffsetVector[j] = -1;
511
512 interpreterResult = Yarr::interpret(m_regExpBytecode.get(), s, startOffset, interpreterOffsetVector);
513
514 if (jitResult != interpreterResult)
515 differences++;
516
517 for (unsigned j = 2, i = 0; i < m_numSubpatterns; j +=2, i++)
518 if ((offsetVector[j] != interpreterOffsetVector[j])
519 || ((offsetVector[j] >= 0) && (offsetVector[j+1] != interpreterOffsetVector[j+1])))
520 differences++;
521
522 if (differences) {
523 dataLogF("RegExp Discrepency for /%s/\n string input ", pattern().utf8().data());
524 unsigned segmentLen = s.length() - static_cast<unsigned>(startOffset);
525
526 dataLogF((segmentLen < 150) ? "\"%s\"\n" : "\"%148s...\"\n", s.utf8().data() + startOffset);
527
528 if (jitResult != interpreterResult) {
529 dataLogF(" JIT result = %d, blah interpreted result = %d\n", jitResult, interpreterResult);
530 differences--;
531 } else {
532 dataLogF(" Correct result = %d\n", jitResult);
533 }
534
535 if (differences) {
536 for (unsigned j = 2, i = 0; i < m_numSubpatterns; j +=2, i++) {
537 if (offsetVector[j] != interpreterOffsetVector[j])
538 dataLogF(" JIT offset[%d] = %d, interpreted offset[%d] = %d\n", j, offsetVector[j], j, interpreterOffsetVector[j]);
539 if ((offsetVector[j] >= 0) && (offsetVector[j+1] != interpreterOffsetVector[j+1]))
540 dataLogF(" JIT offset[%d] = %d, interpreted offset[%d] = %d\n", j+1, offsetVector[j+1], j+1, interpreterOffsetVector[j+1]);
541 }
542 }
543 }
544}
545#endif
546
547#if ENABLE(REGEXP_TRACING)
548 void RegExp::printTraceData()
549 {
550 char formattedPattern[41];
551 char rawPattern[41];
552
553 strncpy(rawPattern, pattern().utf8().data(), 40);
554 rawPattern[40]= '\0';
555
556 int pattLen = strlen(rawPattern);
557
558 snprintf(formattedPattern, 41, (pattLen <= 38) ? "/%.38s/" : "/%.36s...", rawPattern);
559
560#if ENABLE(YARR_JIT)
561 Yarr::YarrCodeBlock& codeBlock = m_regExpJITCode;
562
563 const size_t jitAddrSize = 20;
564 char jitAddr[jitAddrSize];
565 if (m_state == JITCode)
566 snprintf(jitAddr, jitAddrSize, "fallback");
567 else
568 snprintf(jitAddr, jitAddrSize, "0x%014lx", reinterpret_cast<unsigned long int>(codeBlock.getAddr()));
569#else
570 const char* jitAddr = "JIT Off";
571#endif
572
573 printf("%-40.40s %16.16s %10d %10d\n", formattedPattern, jitAddr, m_rtMatchCallCount, m_rtMatchFoundCount);
574 }
575#endif
576
577} // namespace JSC