X-Git-Url: https://git.saurik.com/apple/javascriptcore.git/blobdiff_plain/ba379fdc102753d6be2c4d937058fe40257329fe..a253471d7f8e4d91bf6ebabab00155c3b387d3d0:/runtime/RegExp.cpp diff --git a/runtime/RegExp.cpp b/runtime/RegExp.cpp index 7dd4a8f..b0f6760 100644 --- a/runtime/RegExp.cpp +++ b/runtime/RegExp.cpp @@ -2,6 +2,7 @@ * Copyright (C) 1999-2001, 2004 Harri Porten (porten@kde.org) * Copyright (c) 2007, 2008 Apple Inc. All rights reserved. * Copyright (C) 2009 Torch Mobile, Inc. + * Copyright (C) 2010 Peter Varga (pvarga@inf.u-szeged.hu), University of Szeged * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public @@ -21,7 +22,11 @@ #include "config.h" #include "RegExp.h" + #include "Lexer.h" +#include "RegExpCache.h" +#include "yarr/Yarr.h" +#include "yarr/YarrJIT.h" #include #include #include @@ -29,232 +34,544 @@ #include -#if ENABLE(YARR) +#define REGEXP_FUNC_TEST_DATA_GEN 0 -#include "yarr/RegexCompiler.h" -#if ENABLE(YARR_JIT) -#include "yarr/RegexJIT.h" -#else -#include "yarr/RegexInterpreter.h" -#endif +namespace JSC { -#else +const ClassInfo RegExp::s_info = { "RegExp", 0, 0, 0, CREATE_METHOD_TABLE(RegExp) }; -#if ENABLE(WREC) -#include "JIT.h" -#include "WRECGenerator.h" -#endif -#include +RegExpFlags regExpFlags(const UString& string) +{ + RegExpFlags flags = NoFlags; + + for (unsigned i = 0; i < string.length(); ++i) { + switch (string[i]) { + case 'g': + if (flags & FlagGlobal) + return InvalidFlags; + flags = static_cast(flags | FlagGlobal); + break; + + case 'i': + if (flags & FlagIgnoreCase) + return InvalidFlags; + flags = static_cast(flags | FlagIgnoreCase); + break; + + case 'm': + if (flags & FlagMultiline) + return InvalidFlags; + flags = static_cast(flags | FlagMultiline); + break; + + default: + return InvalidFlags; + } + } -#endif + return flags; +} -namespace JSC { +#if REGEXP_FUNC_TEST_DATA_GEN +class RegExpFunctionalTestCollector { + // This class is not thread safe. +protected: + static const char* const s_fileName; -#if ENABLE(WREC) -using namespace WREC; -#endif +public: + static RegExpFunctionalTestCollector* get(); -inline RegExp::RegExp(JSGlobalData* globalData, const UString& pattern) - : m_pattern(pattern) - , m_flagBits(0) - , m_constructionError(0) - , m_numSubpatterns(0) + ~RegExpFunctionalTestCollector(); + + void outputOneTest(RegExp*, UString, int, int*, int); + void clearRegExp(RegExp* regExp) + { + if (regExp == m_lastRegExp) + m_lastRegExp = 0; + } + +private: + RegExpFunctionalTestCollector(); + + void outputEscapedUString(const UString&, bool escapeSlash = false); + + static RegExpFunctionalTestCollector* s_instance; + FILE* m_file; + RegExp* m_lastRegExp; +}; + +const char* const RegExpFunctionalTestCollector::s_fileName = "/tmp/RegExpTestsData"; +RegExpFunctionalTestCollector* RegExpFunctionalTestCollector::s_instance = 0; + +RegExpFunctionalTestCollector* RegExpFunctionalTestCollector::get() { - compile(globalData); + if (!s_instance) + s_instance = new RegExpFunctionalTestCollector(); + + return s_instance; } -inline RegExp::RegExp(JSGlobalData* globalData, const UString& pattern, const UString& flags) - : m_pattern(pattern) +void RegExpFunctionalTestCollector::outputOneTest(RegExp* regExp, UString s, int startOffset, int* ovector, int result) +{ + if ((!m_lastRegExp) || (m_lastRegExp != regExp)) { + m_lastRegExp = regExp; + fputc('/', m_file); + outputEscapedUString(regExp->pattern(), true); + fputc('/', m_file); + if (regExp->global()) + fputc('g', m_file); + if (regExp->ignoreCase()) + fputc('i', m_file); + if (regExp->multiline()) + fputc('m', m_file); + fprintf(m_file, "\n"); + } + + fprintf(m_file, " \""); + outputEscapedUString(s); + fprintf(m_file, "\", %d, %d, (", startOffset, result); + for (unsigned i = 0; i <= regExp->numSubpatterns(); i++) { + int subpatternBegin = ovector[i * 2]; + int subpatternEnd = ovector[i * 2 + 1]; + if (subpatternBegin == -1) + subpatternEnd = -1; + fprintf(m_file, "%d, %d", subpatternBegin, subpatternEnd); + if (i < regExp->numSubpatterns()) + fputs(", ", m_file); + } + + fprintf(m_file, ")\n"); + fflush(m_file); +} + +RegExpFunctionalTestCollector::RegExpFunctionalTestCollector() +{ + m_file = fopen(s_fileName, "r+"); + if (!m_file) + m_file = fopen(s_fileName, "w+"); + + fseek(m_file, 0L, SEEK_END); +} + +RegExpFunctionalTestCollector::~RegExpFunctionalTestCollector() +{ + fclose(m_file); + s_instance = 0; +} + +void RegExpFunctionalTestCollector::outputEscapedUString(const UString& s, bool escapeSlash) +{ + int len = s.length(); + + for (int i = 0; i < len; ++i) { + UChar c = s[i]; + + switch (c) { + case '\0': + fputs("\\0", m_file); + break; + case '\a': + fputs("\\a", m_file); + break; + case '\b': + fputs("\\b", m_file); + break; + case '\f': + fputs("\\f", m_file); + break; + case '\n': + fputs("\\n", m_file); + break; + case '\r': + fputs("\\r", m_file); + break; + case '\t': + fputs("\\t", m_file); + break; + case '\v': + fputs("\\v", m_file); + break; + case '/': + if (escapeSlash) + fputs("\\/", m_file); + else + fputs("/", m_file); + break; + case '\"': + fputs("\\\"", m_file); + break; + case '\\': + fputs("\\\\", m_file); + break; + case '\?': + fputs("\?", m_file); + break; + default: + if (c > 0x7f) + fprintf(m_file, "\\u%04x", c); + else + fputc(c, m_file); + break; + } + } +} +#endif + +RegExp::RegExp(JSGlobalData& globalData, const UString& patternString, RegExpFlags flags) + : JSCell(globalData, globalData.regExpStructure.get()) + , m_state(NotCompiled) + , m_patternString(patternString) , m_flags(flags) - , m_flagBits(0) , m_constructionError(0) , m_numSubpatterns(0) +#if ENABLE(REGEXP_TRACING) + , m_rtMatchCallCount(0) + , m_rtMatchFoundCount(0) +#endif { - // NOTE: The global flag is handled on a case-by-case basis by functions like - // String::match and RegExpObject::match. - if (flags.find('g') != -1) - m_flagBits |= Global; - if (flags.find('i') != -1) - m_flagBits |= IgnoreCase; - if (flags.find('m') != -1) - m_flagBits |= Multiline; - - compile(globalData); } -#if !ENABLE(YARR) -RegExp::~RegExp() +void RegExp::finishCreation(JSGlobalData& globalData) { - jsRegExpFree(m_regExp); + Base::finishCreation(globalData); + Yarr::YarrPattern pattern(m_patternString, ignoreCase(), multiline(), &m_constructionError); + if (m_constructionError) + m_state = ParseError; + else + m_numSubpatterns = pattern.m_numSubpatterns; } + +void RegExp::destroy(JSCell* cell) +{ + RegExp* thisObject = jsCast(cell); +#if REGEXP_FUNC_TEST_DATA_GEN + RegExpFunctionalTestCollector::get()->clearRegExp(this); #endif + thisObject->RegExp::~RegExp(); +} -PassRefPtr RegExp::create(JSGlobalData* globalData, const UString& pattern) +RegExp* RegExp::createWithoutCaching(JSGlobalData& globalData, const UString& patternString, RegExpFlags flags) { - return adoptRef(new RegExp(globalData, pattern)); + RegExp* regExp = new (NotNull, allocateCell(globalData.heap)) RegExp(globalData, patternString, flags); + regExp->finishCreation(globalData); + return regExp; } -PassRefPtr RegExp::create(JSGlobalData* globalData, const UString& pattern, const UString& flags) +RegExp* RegExp::create(JSGlobalData& globalData, const UString& patternString, RegExpFlags flags) { - return adoptRef(new RegExp(globalData, pattern, flags)); + return globalData.regExpCache()->lookupOrCreate(patternString, flags); } -#if ENABLE(YARR) +void RegExp::compile(JSGlobalData* globalData, Yarr::YarrCharSize charSize) +{ + Yarr::YarrPattern pattern(m_patternString, ignoreCase(), multiline(), &m_constructionError); + if (m_constructionError) { + ASSERT_NOT_REACHED(); + m_state = ParseError; + return; + } + ASSERT(m_numSubpatterns == pattern.m_numSubpatterns); + + if (!hasCode()) { + ASSERT(m_state == NotCompiled); + globalData->regExpCache()->addToStrongCache(this); + m_state = ByteCode; + } + +#if ENABLE(YARR_JIT) + if (!pattern.m_containsBackreferences && globalData->canUseRegExpJIT()) { + Yarr::jitCompile(pattern, charSize, globalData, m_regExpJITCode); +#if ENABLE(YARR_JIT_DEBUG) + if (!m_regExpJITCode.isFallBack()) + m_state = JITCode; + else + m_state = ByteCode; +#else + if (!m_regExpJITCode.isFallBack()) { + m_state = JITCode; + return; + } +#endif + } +#else + UNUSED_PARAM(charSize); +#endif + + m_regExpBytecode = Yarr::byteCompile(pattern, &globalData->m_regExpAllocator); +} -void RegExp::compile(JSGlobalData* globalData) +void RegExp::compileIfNecessary(JSGlobalData& globalData, Yarr::YarrCharSize charSize) { + if (hasCode()) { #if ENABLE(YARR_JIT) - Yarr::jitCompileRegex(globalData, m_regExpJITCode, m_pattern, m_numSubpatterns, m_constructionError, ignoreCase(), multiline()); + if (m_state != JITCode) + return; + if ((charSize == Yarr::Char8) && (m_regExpJITCode.has8BitCode())) + return; + if ((charSize == Yarr::Char16) && (m_regExpJITCode.has16BitCode())) + return; #else - UNUSED_PARAM(globalData); - m_regExpBytecode.set(Yarr::byteCompileRegex(m_pattern, m_numSubpatterns, m_constructionError, ignoreCase(), multiline())); + return; #endif + } + + compile(&globalData, charSize); } -int RegExp::match(const UString& s, int startOffset, Vector* ovector) +int RegExp::match(JSGlobalData& globalData, const UString& s, unsigned startOffset, Vector& ovector) { - if (startOffset < 0) - startOffset = 0; - if (ovector) - ovector->clear(); +#if ENABLE(REGEXP_TRACING) + m_rtMatchCallCount++; +#endif + + ASSERT(m_state != ParseError); + compileIfNecessary(globalData, s.is8Bit() ? Yarr::Char8 : Yarr::Char16); - if (startOffset > s.size() || s.isNull()) - return -1; + int offsetVectorSize = (m_numSubpatterns + 1) * 2; + ovector.resize(offsetVectorSize); + int* offsetVector = ovector.data(); + int result; #if ENABLE(YARR_JIT) - if (!!m_regExpJITCode) { -#else - if (m_regExpBytecode) { + if (m_state == JITCode) { + if (s.is8Bit()) + result = m_regExpJITCode.execute(s.characters8(), startOffset, s.length(), offsetVector).start; + else + result = m_regExpJITCode.execute(s.characters16(), startOffset, s.length(), offsetVector).start; +#if ENABLE(YARR_JIT_DEBUG) + matchCompareWithInterpreter(s, startOffset, offsetVector, result); #endif - int offsetVectorSize = (m_numSubpatterns + 1) * 3; // FIXME: should be 2 - but adding temporary fallback to pcre. - int* offsetVector; - Vector nonReturnedOvector; - if (ovector) { - ovector->resize(offsetVectorSize); - offsetVector = ovector->data(); - } else { - nonReturnedOvector.resize(offsetVectorSize); - offsetVector = nonReturnedOvector.data(); + } else +#endif + result = Yarr::interpret(m_regExpBytecode.get(), s, startOffset, reinterpret_cast(offsetVector)); + + // FIXME: The YARR engine should handle unsigned or size_t length matches. + // The YARR Interpreter is "unsigned" clean, while the YARR JIT hasn't been addressed. + // The offset vector handling needs to change as well. + // Right now we convert a match where the offsets overflowed into match failure. + // There are two places in WebCore that call the interpreter directly that need to + // have their offsets changed to int as well. They are platform/text/RegularExpression.cpp + // and inspector/ContentSearchUtils.cpp. + if (s.length() > INT_MAX) { + bool overflowed = false; + + if (result < -1) + overflowed = true; + + for (unsigned i = 0; i <= m_numSubpatterns; i++) { + if ((offsetVector[i*2] < -1) || ((offsetVector[i*2] >= 0) && (offsetVector[i*2+1] < -1))) { + overflowed = true; + offsetVector[i*2] = -1; + offsetVector[i*2+1] = -1; + } } - ASSERT(offsetVector); - for (int j = 0; j < offsetVectorSize; ++j) - offsetVector[j] = -1; + if (overflowed) + result = -1; + } + ASSERT(result >= -1); -#if ENABLE(YARR_JIT) - int result = Yarr::executeRegex(m_regExpJITCode, s.data(), startOffset, s.size(), offsetVector, offsetVectorSize); -#else - int result = Yarr::interpretRegex(m_regExpBytecode.get(), s.data(), startOffset, s.size(), offsetVector); +#if REGEXP_FUNC_TEST_DATA_GEN + RegExpFunctionalTestCollector::get()->outputOneTest(this, s, startOffset, offsetVector, result); #endif - if (result < 0) { -#ifndef NDEBUG - // TODO: define up a symbol, rather than magic -1 - if (result != -1) - fprintf(stderr, "jsRegExpExecute failed with result %d\n", result); +#if ENABLE(REGEXP_TRACING) + if (result != -1) + m_rtMatchFoundCount++; #endif - if (ovector) - ovector->clear(); - } - return result; - } - return -1; + return result; } +void RegExp::compileMatchOnly(JSGlobalData* globalData, Yarr::YarrCharSize charSize) +{ + Yarr::YarrPattern pattern(m_patternString, ignoreCase(), multiline(), &m_constructionError); + if (m_constructionError) { + ASSERT_NOT_REACHED(); + m_state = ParseError; + return; + } + ASSERT(m_numSubpatterns == pattern.m_numSubpatterns); + + if (!hasCode()) { + ASSERT(m_state == NotCompiled); + globalData->regExpCache()->addToStrongCache(this); + m_state = ByteCode; + } + +#if ENABLE(YARR_JIT) + if (!pattern.m_containsBackreferences && globalData->canUseRegExpJIT()) { + Yarr::jitCompile(pattern, charSize, globalData, m_regExpJITCode, Yarr::MatchOnly); +#if ENABLE(YARR_JIT_DEBUG) + if (!m_regExpJITCode.isFallBack()) + m_state = JITCode; + else + m_state = ByteCode; +#else + if (!m_regExpJITCode.isFallBack()) { + m_state = JITCode; + return; + } +#endif + } #else + UNUSED_PARAM(charSize); +#endif + + m_regExpBytecode = Yarr::byteCompile(pattern, &globalData->m_regExpAllocator); +} -void RegExp::compile(JSGlobalData* globalData) +void RegExp::compileIfNecessaryMatchOnly(JSGlobalData& globalData, Yarr::YarrCharSize charSize) { - m_regExp = 0; -#if ENABLE(WREC) - m_wrecFunction = Generator::compileRegExp(globalData, m_pattern, &m_numSubpatterns, &m_constructionError, m_executablePool, ignoreCase(), multiline()); - if (m_wrecFunction || m_constructionError) - return; - // Fall through to non-WREC case. + if (hasCode()) { +#if ENABLE(YARR_JIT) + if (m_state != JITCode) + return; + if ((charSize == Yarr::Char8) && (m_regExpJITCode.has8BitCodeMatchOnly())) + return; + if ((charSize == Yarr::Char16) && (m_regExpJITCode.has16BitCodeMatchOnly())) + return; #else - UNUSED_PARAM(globalData); + return; #endif + } - JSRegExpIgnoreCaseOption ignoreCaseOption = ignoreCase() ? JSRegExpIgnoreCase : JSRegExpDoNotIgnoreCase; - JSRegExpMultilineOption multilineOption = multiline() ? JSRegExpMultiline : JSRegExpSingleLine; - m_regExp = jsRegExpCompile(reinterpret_cast(m_pattern.data()), m_pattern.size(), ignoreCaseOption, multilineOption, &m_numSubpatterns, &m_constructionError); + compileMatchOnly(&globalData, charSize); } -int RegExp::match(const UString& s, int startOffset, Vector* ovector) +MatchResult RegExp::match(JSGlobalData& globalData, const UString& s, unsigned startOffset) { - if (startOffset < 0) - startOffset = 0; - if (ovector) - ovector->clear(); - - if (startOffset > s.size() || s.isNull()) - return -1; - -#if ENABLE(WREC) - if (m_wrecFunction) { - int offsetVectorSize = (m_numSubpatterns + 1) * 2; - int* offsetVector; - Vector nonReturnedOvector; - if (ovector) { - ovector->resize(offsetVectorSize); - offsetVector = ovector->data(); - } else { - nonReturnedOvector.resize(offsetVectorSize); - offsetVector = nonReturnedOvector.data(); - } - ASSERT(offsetVector); - for (int j = 0; j < offsetVectorSize; ++j) - offsetVector[j] = -1; +#if ENABLE(REGEXP_TRACING) + m_rtMatchCallCount++; +#endif - int result = m_wrecFunction(s.data(), startOffset, s.size(), offsetVector); + ASSERT(m_state != ParseError); + compileIfNecessaryMatchOnly(globalData, s.is8Bit() ? Yarr::Char8 : Yarr::Char16); - if (result < 0) { -#ifndef NDEBUG - // TODO: define up a symbol, rather than magic -1 - if (result != -1) - fprintf(stderr, "jsRegExpExecute failed with result %d\n", result); +#if ENABLE(YARR_JIT) + if (m_state == JITCode) { + MatchResult result = s.is8Bit() ? + m_regExpJITCode.execute(s.characters8(), startOffset, s.length()) : + m_regExpJITCode.execute(s.characters16(), startOffset, s.length()); +#if ENABLE(REGEXP_TRACING) + if (!result) + m_rtMatchFoundCount++; #endif - if (ovector) - ovector->clear(); - } return result; - } else + } #endif - if (m_regExp) { - // Set up the offset vector for the result. - // First 2/3 used for result, the last third used by PCRE. - int* offsetVector; - int offsetVectorSize; - int fixedSizeOffsetVector[3]; - if (!ovector) { - offsetVectorSize = 3; - offsetVector = fixedSizeOffsetVector; - } else { - offsetVectorSize = (m_numSubpatterns + 1) * 3; - ovector->resize(offsetVectorSize); - offsetVector = ovector->data(); - } - int numMatches = jsRegExpExecute(m_regExp, reinterpret_cast(s.data()), s.size(), startOffset, offsetVector, offsetVectorSize); - - if (numMatches < 0) { -#ifndef NDEBUG - if (numMatches != JSRegExpErrorNoMatch) - fprintf(stderr, "jsRegExpExecute failed with result %d\n", numMatches); + int offsetVectorSize = (m_numSubpatterns + 1) * 2; + int* offsetVector; + Vector nonReturnedOvector; + nonReturnedOvector.resize(offsetVectorSize); + offsetVector = nonReturnedOvector.data(); + int r = Yarr::interpret(m_regExpBytecode.get(), s, startOffset, reinterpret_cast(offsetVector)); +#if REGEXP_FUNC_TEST_DATA_GEN + RegExpFunctionalTestCollector::get()->outputOneTest(this, s, startOffset, offsetVector, result); #endif - if (ovector) - ovector->clear(); - return -1; - } - return offsetVector[0]; + if (r >= 0) { +#if ENABLE(REGEXP_TRACING) + m_rtMatchFoundCount++; +#endif + return MatchResult(r, reinterpret_cast(offsetVector)[1]); } - return -1; + return MatchResult::failed(); } +void RegExp::invalidateCode() +{ + if (!hasCode()) + return; + m_state = NotCompiled; +#if ENABLE(YARR_JIT) + m_regExpJITCode.clear(); +#endif + m_regExpBytecode.clear(); +} + +#if ENABLE(YARR_JIT_DEBUG) +void RegExp::matchCompareWithInterpreter(const UString& s, int startOffset, int* offsetVector, int jitResult) +{ + int offsetVectorSize = (m_numSubpatterns + 1) * 2; + Vector interpreterOvector; + interpreterOvector.resize(offsetVectorSize); + int* interpreterOffsetVector = interpreterOvector.data(); + int interpreterResult = 0; + int differences = 0; + + // Initialize interpreterOffsetVector with the return value (index 0) and the + // first subpattern start indicies (even index values) set to -1. + // No need to init the subpattern end indicies. + for (unsigned j = 0, i = 0; i < m_numSubpatterns + 1; j += 2, i++) + interpreterOffsetVector[j] = -1; + + interpreterResult = Yarr::interpret(m_regExpBytecode.get(), s, startOffset, interpreterOffsetVector); + + if (jitResult != interpreterResult) + differences++; + + for (unsigned j = 2, i = 0; i < m_numSubpatterns; j +=2, i++) + if ((offsetVector[j] != interpreterOffsetVector[j]) + || ((offsetVector[j] >= 0) && (offsetVector[j+1] != interpreterOffsetVector[j+1]))) + differences++; + + if (differences) { + dataLog("RegExp Discrepency for /%s/\n string input ", pattern().utf8().data()); + unsigned segmentLen = s.length() - static_cast(startOffset); + + dataLog((segmentLen < 150) ? "\"%s\"\n" : "\"%148s...\"\n", s.utf8().data() + startOffset); + + if (jitResult != interpreterResult) { + dataLog(" JIT result = %d, blah interpreted result = %d\n", jitResult, interpreterResult); + differences--; + } else { + dataLog(" Correct result = %d\n", jitResult); + } + + if (differences) { + for (unsigned j = 2, i = 0; i < m_numSubpatterns; j +=2, i++) { + if (offsetVector[j] != interpreterOffsetVector[j]) + dataLog(" JIT offset[%d] = %d, interpreted offset[%d] = %d\n", j, offsetVector[j], j, interpreterOffsetVector[j]); + if ((offsetVector[j] >= 0) && (offsetVector[j+1] != interpreterOffsetVector[j+1])) + dataLog(" JIT offset[%d] = %d, interpreted offset[%d] = %d\n", j+1, offsetVector[j+1], j+1, interpreterOffsetVector[j+1]); + } + } + } +} +#endif + +#if ENABLE(REGEXP_TRACING) + void RegExp::printTraceData() + { + char formattedPattern[41]; + char rawPattern[41]; + + strncpy(rawPattern, pattern().utf8().data(), 40); + rawPattern[40]= '\0'; + + int pattLen = strlen(rawPattern); + + snprintf(formattedPattern, 41, (pattLen <= 38) ? "/%.38s/" : "/%.36s...", rawPattern); + +#if ENABLE(YARR_JIT) + Yarr::YarrCodeBlock& codeBlock = m_regExpJITCode; + + const size_t jitAddrSize = 20; + char jitAddr[jitAddrSize]; + if (m_state == JITCode) + snprintf(jitAddr, jitAddrSize, "fallback"); + else + snprintf(jitAddr, jitAddrSize, "0x%014lx", reinterpret_cast(codeBlock.getAddr())); +#else + const char* jitAddr = "JIT Off"; +#endif + + printf("%-40.40s %16.16s %10d %10d\n", formattedPattern, jitAddr, m_rtMatchCallCount, m_rtMatchFoundCount); + } #endif } // namespace JSC