X-Git-Url: https://git.saurik.com/apple/javascriptcore.git/blobdiff_plain/14957cd040308e3eeec43d26bae5d76da13fcd85..HEAD:/yarr/YarrInterpreter.cpp diff --git a/yarr/YarrInterpreter.cpp b/yarr/YarrInterpreter.cpp index 903c692..d45096c 100644 --- a/yarr/YarrInterpreter.cpp +++ b/yarr/YarrInterpreter.cpp @@ -28,16 +28,17 @@ #include "YarrInterpreter.h" #include "Yarr.h" +#include "YarrCanonicalizeUCS2.h" #include - -#ifndef NDEBUG -#include -#endif +#include +#include +#include using namespace WTF; namespace JSC { namespace Yarr { +template class Interpreter { public: struct ParenthesesDisjunctionContext; @@ -78,8 +79,8 @@ public: static inline void popParenthesesDisjunctionContext(BackTrackInfoParentheses* backTrack) { - ASSERT(backTrack->matchAmount); - ASSERT(backTrack->lastContext); + RELEASE_ASSERT(backTrack->matchAmount); + RELEASE_ASSERT(backTrack->lastContext); backTrack->lastContext = backTrack->lastContext->next; --backTrack->matchAmount; } @@ -106,9 +107,8 @@ public: { size_t size = sizeof(DisjunctionContext) - sizeof(uintptr_t) + disjunction->m_frameSize * sizeof(uintptr_t); allocatorPool = allocatorPool->ensureCapacity(size); - if (!allocatorPool) - CRASH(); - return new(allocatorPool->alloc(size)) DisjunctionContext(); + RELEASE_ASSERT(allocatorPool); + return new (allocatorPool->alloc(size)) DisjunctionContext(); } void freeDisjunctionContext(DisjunctionContext* context) @@ -118,7 +118,7 @@ public: struct ParenthesesDisjunctionContext { - ParenthesesDisjunctionContext(int* output, ByteTerm& term) + ParenthesesDisjunctionContext(unsigned* output, ByteTerm& term) : next(0) { unsigned firstSubpatternId = term.atom.subpatternId; @@ -126,10 +126,10 @@ public: for (unsigned i = 0; i < (numNestedSubpatterns << 1); ++i) { subpatternBackup[i] = output[(firstSubpatternId << 1) + i]; - output[(firstSubpatternId << 1) + i] = -1; + output[(firstSubpatternId << 1) + i] = offsetNoMatch; } - new(getDisjunctionContext(term)) DisjunctionContext(); + new (getDisjunctionContext(term)) DisjunctionContext(); } void* operator new(size_t, void* where) @@ -137,7 +137,7 @@ public: return where; } - void restoreOutput(int* output, unsigned firstSubpatternId, unsigned numNestedSubpatterns) + void restoreOutput(unsigned* output, unsigned firstSubpatternId, unsigned numNestedSubpatterns) { for (unsigned i = 0; i < (numNestedSubpatterns << 1); ++i) output[(firstSubpatternId << 1) + i] = subpatternBackup[i]; @@ -149,16 +149,15 @@ public: } ParenthesesDisjunctionContext* next; - int subpatternBackup[1]; + unsigned subpatternBackup[1]; }; - ParenthesesDisjunctionContext* allocParenthesesDisjunctionContext(ByteDisjunction* disjunction, int* output, ByteTerm& term) + ParenthesesDisjunctionContext* allocParenthesesDisjunctionContext(ByteDisjunction* disjunction, unsigned* output, ByteTerm& term) { - size_t size = sizeof(ParenthesesDisjunctionContext) - sizeof(int) + (term.atom.parenthesesDisjunction->m_numSubpatterns << 1) * sizeof(int) + sizeof(DisjunctionContext) - sizeof(uintptr_t) + disjunction->m_frameSize * sizeof(uintptr_t); + size_t size = sizeof(ParenthesesDisjunctionContext) - sizeof(unsigned) + (term.atom.parenthesesDisjunction->m_numSubpatterns << 1) * sizeof(unsigned) + sizeof(DisjunctionContext) - sizeof(uintptr_t) + static_cast(disjunction->m_frameSize) * sizeof(uintptr_t); allocatorPool = allocatorPool->ensureCapacity(size); - if (!allocatorPool) - CRASH(); - return new(allocatorPool->alloc(size)) ParenthesesDisjunctionContext(output, term); + RELEASE_ASSERT(allocatorPool); + return new (allocatorPool->alloc(size)) ParenthesesDisjunctionContext(output, term); } void freeParenthesesDisjunctionContext(ParenthesesDisjunctionContext* context) @@ -168,7 +167,7 @@ public: class InputStream { public: - InputStream(const UChar* input, unsigned start, unsigned length) + InputStream(const CharType* input, unsigned start, unsigned length) : input(input) , pos(start) , length(length) @@ -200,11 +199,10 @@ public: return input[pos] | input[pos + 1] << 16; } - int readChecked(int position) + int readChecked(unsigned negativePositionOffest) { - ASSERT(position < 0); - ASSERT(static_cast(-position) <= pos); - unsigned p = pos + position; + RELEASE_ASSERT(pos >= negativePositionOffest); + unsigned p = pos - negativePositionOffest; ASSERT(p < length); return input[p]; } @@ -248,37 +246,39 @@ public: return length; } - bool checkInput(int count) + bool checkInput(unsigned count) { - if ((pos + count) <= length) { + if (((pos + count) <= length) && ((pos + count) >= pos)) { pos += count; return true; } return false; } - void uncheckInput(int count) + void uncheckInput(unsigned count) { + RELEASE_ASSERT(pos >= count); pos -= count; } - bool atStart(int position) + bool atStart(unsigned negativePositionOffest) { - return (pos + position) == 0; + return pos == negativePositionOffest; } - bool atEnd(int position) + bool atEnd(unsigned negativePositionOffest) { - return (pos + position) == length; + RELEASE_ASSERT(pos >= negativePositionOffest); + return (pos - negativePositionOffest) == length; } - bool isNotAvailableInput(int position) + bool isAvailableInput(unsigned offset) { - return (pos + position) > length; + return (((pos + offset) <= length) && ((pos + offset) >= pos)); } private: - const UChar* input; + const CharType* input; unsigned pos; unsigned length; }; @@ -304,45 +304,52 @@ public: return false; } - bool checkCharacter(int testChar, int inputPosition) + bool checkCharacter(int testChar, unsigned negativeInputOffset) { - return testChar == input.readChecked(inputPosition); + return testChar == input.readChecked(negativeInputOffset); } - bool checkCasedCharacter(int loChar, int hiChar, int inputPosition) + bool checkCasedCharacter(int loChar, int hiChar, unsigned negativeInputOffset) { - int ch = input.readChecked(inputPosition); + int ch = input.readChecked(negativeInputOffset); return (loChar == ch) || (hiChar == ch); } - bool checkCharacterClass(CharacterClass* characterClass, bool invert, int inputPosition) + bool checkCharacterClass(CharacterClass* characterClass, bool invert, unsigned negativeInputOffset) { - bool match = testCharacterClass(characterClass, input.readChecked(inputPosition)); + bool match = testCharacterClass(characterClass, input.readChecked(negativeInputOffset)); return invert ? !match : match; } - bool tryConsumeBackReference(int matchBegin, int matchEnd, int inputOffset) + bool tryConsumeBackReference(int matchBegin, int matchEnd, unsigned negativeInputOffset) { - int matchSize = matchEnd - matchBegin; + unsigned matchSize = (unsigned)(matchEnd - matchBegin); if (!input.checkInput(matchSize)) return false; if (pattern->m_ignoreCase) { - for (int i = 0; i < matchSize; ++i) { - int ch = input.reread(matchBegin + i); - - int lo = Unicode::toLower(ch); - int hi = Unicode::toUpper(ch); - - if ((lo != hi) ? (!checkCasedCharacter(lo, hi, inputOffset - matchSize + i)) : (!checkCharacter(ch, inputOffset - matchSize + i))) { - input.uncheckInput(matchSize); - return false; - } + for (unsigned i = 0; i < matchSize; ++i) { + int oldCh = input.reread(matchBegin + i); + int ch = input.readChecked(negativeInputOffset + matchSize - i); + + if (oldCh == ch) + continue; + + // The definition for canonicalize (see ES 5.1, 15.10.2.8) means that + // unicode values are never allowed to match against ascii ones. + if (isASCII(oldCh) || isASCII(ch)) { + if (toASCIIUpper(oldCh) == toASCIIUpper(ch)) + continue; + } else if (areCanonicallyEquivalent(oldCh, ch)) + continue; + + input.uncheckInput(matchSize); + return false; } } else { - for (int i = 0; i < matchSize; ++i) { - if (!checkCharacter(input.reread(matchBegin + i), inputOffset - matchSize + i)) { + for (unsigned i = 0; i < matchSize; ++i) { + if (!checkCharacter(input.reread(matchBegin + i), negativeInputOffset + matchSize - i)) { input.uncheckInput(matchSize); return false; } @@ -354,7 +361,7 @@ public: bool matchAssertionBOL(ByteTerm& term) { - return (input.atStart(term.inputPosition)) || (pattern->m_multiline && testCharacterClass(pattern->newlineCharacterClass, input.readChecked(term.inputPosition - 1))); + return (input.atStart(term.inputPosition)) || (pattern->m_multiline && testCharacterClass(pattern->newlineCharacterClass, input.readChecked(term.inputPosition + 1))); } bool matchAssertionEOL(ByteTerm& term) @@ -367,7 +374,7 @@ public: bool matchAssertionWordBoundary(ByteTerm& term) { - bool prevIsWordchar = !input.atStart(term.inputPosition) && testCharacterClass(pattern->wordcharCharacterClass, input.readChecked(term.inputPosition - 1)); + bool prevIsWordchar = !input.atStart(term.inputPosition) && testCharacterClass(pattern->wordcharCharacterClass, input.readChecked(term.inputPosition + 1)); bool readIsWordchar; if (term.inputPosition) readIsWordchar = !input.atEnd(term.inputPosition) && testCharacterClass(pattern->wordcharCharacterClass, input.readChecked(term.inputPosition)); @@ -397,7 +404,7 @@ public: case QuantifierNonGreedy: if ((backTrack->matchAmount < term.atom.quantityCount) && input.checkInput(1)) { ++backTrack->matchAmount; - if (checkCharacter(term.atom.patternCharacter, term.inputPosition - 1)) + if (checkCharacter(term.atom.patternCharacter, term.inputPosition + 1)) return true; } input.uncheckInput(backTrack->matchAmount); @@ -426,7 +433,7 @@ public: case QuantifierNonGreedy: if ((backTrack->matchAmount < term.atom.quantityCount) && input.checkInput(1)) { ++backTrack->matchAmount; - if (checkCasedCharacter(term.atom.casedCharacter.lo, term.atom.casedCharacter.hi, term.inputPosition - 1)) + if (checkCasedCharacter(term.atom.casedCharacter.lo, term.atom.casedCharacter.hi, term.inputPosition + 1)) return true; } input.uncheckInput(backTrack->matchAmount); @@ -444,7 +451,7 @@ public: switch (term.atom.quantityType) { case QuantifierFixedCount: { for (unsigned matchAmount = 0; matchAmount < term.atom.quantityCount; ++matchAmount) { - if (!checkCharacterClass(term.atom.characterClass, term.invert(), term.inputPosition + matchAmount)) + if (!checkCharacterClass(term.atom.characterClass, term.invert(), term.inputPosition - matchAmount)) return false; } return true; @@ -453,7 +460,7 @@ public: case QuantifierGreedy: { unsigned matchAmount = 0; while ((matchAmount < term.atom.quantityCount) && input.checkInput(1)) { - if (!checkCharacterClass(term.atom.characterClass, term.invert(), term.inputPosition - 1)) { + if (!checkCharacterClass(term.atom.characterClass, term.invert(), term.inputPosition + 1)) { input.uncheckInput(1); break; } @@ -469,7 +476,7 @@ public: return true; } - ASSERT_NOT_REACHED(); + RELEASE_ASSERT_NOT_REACHED(); return false; } @@ -493,7 +500,7 @@ public: case QuantifierNonGreedy: if ((backTrack->matchAmount < term.atom.quantityCount) && input.checkInput(1)) { ++backTrack->matchAmount; - if (checkCharacterClass(term.atom.characterClass, term.invert(), term.inputPosition - 1)) + if (checkCharacterClass(term.atom.characterClass, term.invert(), term.inputPosition + 1)) return true; } input.uncheckInput(backTrack->matchAmount); @@ -508,16 +515,19 @@ public: ASSERT(term.type == ByteTerm::TypeBackReference); BackTrackInfoBackReference* backTrack = reinterpret_cast(context->frame + term.frameLocation); - int matchBegin = output[(term.atom.subpatternId << 1)]; - int matchEnd = output[(term.atom.subpatternId << 1) + 1]; + unsigned matchBegin = output[(term.atom.subpatternId << 1)]; + unsigned matchEnd = output[(term.atom.subpatternId << 1) + 1]; // If the end position of the referenced match hasn't set yet then the backreference in the same parentheses where it references to that. // In this case the result of match is empty string like when it references to a parentheses with zero-width match. // Eg.: /(a\1)/ - if (matchEnd == -1) + if (matchEnd == offsetNoMatch) + return true; + + if (matchBegin == offsetNoMatch) return true; - ASSERT((matchBegin == -1) || (matchBegin <= matchEnd)); + ASSERT(matchBegin <= matchEnd); if (matchBegin == matchEnd) return true; @@ -548,7 +558,7 @@ public: return true; } - ASSERT_NOT_REACHED(); + RELEASE_ASSERT_NOT_REACHED(); return false; } @@ -557,9 +567,13 @@ public: ASSERT(term.type == ByteTerm::TypeBackReference); BackTrackInfoBackReference* backTrack = reinterpret_cast(context->frame + term.frameLocation); - int matchBegin = output[(term.atom.subpatternId << 1)]; - int matchEnd = output[(term.atom.subpatternId << 1) + 1]; - ASSERT((matchBegin == -1) || (matchBegin <= matchEnd)); + unsigned matchBegin = output[(term.atom.subpatternId << 1)]; + unsigned matchEnd = output[(term.atom.subpatternId << 1) + 1]; + + if (matchBegin == offsetNoMatch) + return false; + + ASSERT(matchBegin <= matchEnd); if (matchBegin == matchEnd) return false; @@ -648,7 +662,7 @@ public: if (term.capture()) { unsigned subpatternId = term.atom.subpatternId; - output[(subpatternId << 1)] = input.getPos() + term.inputPosition; + output[(subpatternId << 1)] = input.getPos() - term.inputPosition; } return true; @@ -680,8 +694,8 @@ public: if (term.capture()) { unsigned subpatternId = term.atom.subpatternId; - output[(subpatternId << 1)] = -1; - output[(subpatternId << 1) + 1] = -1; + output[(subpatternId << 1)] = offsetNoMatch; + output[(subpatternId << 1) + 1] = offsetNoMatch; } switch (term.atom.quantityType) { @@ -693,6 +707,7 @@ public: return true; case QuantifierNonGreedy: ASSERT(backTrack->begin != notFound); + FALLTHROUGH; case QuantifierFixedCount: break; } @@ -713,6 +728,7 @@ public: context->term -= term.atom.parenthesesWidth; return false; } + FALLTHROUGH; case QuantifierNonGreedy: if (backTrack->begin == notFound) { backTrack->begin = input.getPos(); @@ -728,6 +744,7 @@ public: context->term -= term.atom.parenthesesWidth; return true; } + FALLTHROUGH; case QuantifierFixedCount: break; } @@ -778,7 +795,7 @@ public: { // 'Terminal' parentheses are at the end of the regex, and as such a match past end // should always be returned as a successful match - we should never backtrack to here. - ASSERT_NOT_REACHED(); + RELEASE_ASSERT_NOT_REACHED(); return false; } @@ -862,12 +879,11 @@ public: resetMatches(term, context); freeParenthesesDisjunctionContext(context); - if (result == JSRegExpNoMatch) { - JSRegExpResult backtrackResult = parenthesesDoBacktrack(term, backTrack); - if (backtrackResult != JSRegExpMatch) - return backtrackResult; - } else + if (result != JSRegExpNoMatch) return result; + JSRegExpResult backtrackResult = parenthesesDoBacktrack(term, backTrack); + if (backtrackResult != JSRegExpMatch) + return backtrackResult; } } @@ -905,7 +921,7 @@ public: return JSRegExpMatch; } - ASSERT_NOT_REACHED(); + RELEASE_ASSERT_NOT_REACHED(); return JSRegExpErrorNoMatch; } @@ -949,12 +965,11 @@ public: resetMatches(term, context); freeParenthesesDisjunctionContext(context); - if (result == JSRegExpNoMatch) { - JSRegExpResult backtrackResult = parenthesesDoBacktrack(term, backTrack); - if (backtrackResult != JSRegExpMatch) - return backtrackResult; - } else + if (result != JSRegExpNoMatch) return result; + JSRegExpResult backtrackResult = parenthesesDoBacktrack(term, backTrack); + if (backtrackResult != JSRegExpMatch) + return backtrackResult; } } @@ -1038,14 +1053,15 @@ public: popParenthesesDisjunctionContext(backTrack); freeParenthesesDisjunctionContext(context); - return result; + if (result != JSRegExpNoMatch) + return result; } return JSRegExpNoMatch; } } - ASSERT_NOT_REACHED(); + RELEASE_ASSERT_NOT_REACHED(); return JSRegExpErrorNoMatch; } @@ -1139,7 +1155,7 @@ public: case ByteTerm::TypePatternCharacterOnce: case ByteTerm::TypePatternCharacterFixed: { for (unsigned matchAmount = 0; matchAmount < currentTerm().atom.quantityCount; ++matchAmount) { - if (!checkCharacter(currentTerm().atom.patternCharacter, currentTerm().inputPosition + matchAmount)) + if (!checkCharacter(currentTerm().atom.patternCharacter, currentTerm().inputPosition - matchAmount)) BACKTRACK(); } MATCH_NEXT(); @@ -1148,7 +1164,7 @@ public: BackTrackInfoPatternCharacter* backTrack = reinterpret_cast(context->frame + currentTerm().frameLocation); unsigned matchAmount = 0; while ((matchAmount < currentTerm().atom.quantityCount) && input.checkInput(1)) { - if (!checkCharacter(currentTerm().atom.patternCharacter, currentTerm().inputPosition - 1)) { + if (!checkCharacter(currentTerm().atom.patternCharacter, currentTerm().inputPosition + 1)) { input.uncheckInput(1); break; } @@ -1167,7 +1183,7 @@ public: case ByteTerm::TypePatternCasedCharacterOnce: case ByteTerm::TypePatternCasedCharacterFixed: { for (unsigned matchAmount = 0; matchAmount < currentTerm().atom.quantityCount; ++matchAmount) { - if (!checkCasedCharacter(currentTerm().atom.casedCharacter.lo, currentTerm().atom.casedCharacter.hi, currentTerm().inputPosition + matchAmount)) + if (!checkCasedCharacter(currentTerm().atom.casedCharacter.lo, currentTerm().atom.casedCharacter.hi, currentTerm().inputPosition - matchAmount)) BACKTRACK(); } MATCH_NEXT(); @@ -1176,7 +1192,7 @@ public: BackTrackInfoPatternCharacter* backTrack = reinterpret_cast(context->frame + currentTerm().frameLocation); unsigned matchAmount = 0; while ((matchAmount < currentTerm().atom.quantityCount) && input.checkInput(1)) { - if (!checkCasedCharacter(currentTerm().atom.casedCharacter.lo, currentTerm().atom.casedCharacter.hi, currentTerm().inputPosition - 1)) { + if (!checkCasedCharacter(currentTerm().atom.casedCharacter.lo, currentTerm().atom.casedCharacter.hi, currentTerm().inputPosition + 1)) { input.uncheckInput(1); break; } @@ -1251,7 +1267,7 @@ public: } // We should never fall-through to here. - ASSERT_NOT_REACHED(); + RELEASE_ASSERT_NOT_REACHED(); backtrack: ASSERT(context->term < static_cast(disjunction->terms.size())); @@ -1260,7 +1276,7 @@ public: case ByteTerm::TypeSubpatternBegin: return JSRegExpNoMatch; case ByteTerm::TypeSubpatternEnd: - ASSERT_NOT_REACHED(); + RELEASE_ASSERT_NOT_REACHED(); case ByteTerm::TypeBodyAlternativeBegin: case ByteTerm::TypeBodyAlternativeDisjunction: { @@ -1282,7 +1298,7 @@ public: MATCH_NEXT(); } case ByteTerm::TypeBodyAlternativeEnd: - ASSERT_NOT_REACHED(); + RELEASE_ASSERT_NOT_REACHED(); case ByteTerm::TypeAlternativeBegin: case ByteTerm::TypeAlternativeDisjunction: { @@ -1371,10 +1387,10 @@ public: BACKTRACK(); case ByteTerm::TypeDotStarEnclosure: - ASSERT_NOT_REACHED(); + RELEASE_ASSERT_NOT_REACHED(); } - ASSERT_NOT_REACHED(); + RELEASE_ASSERT_NOT_REACHED(); return JSRegExpErrorNoMatch; } @@ -1394,14 +1410,16 @@ public: return result; } - int interpret() + unsigned interpret() { - allocatorPool = pattern->m_allocator->startAllocator(); - if (!allocatorPool) - CRASH(); + if (!input.isAvailableInput(0)) + return offsetNoMatch; - for (unsigned i = 0; i < ((pattern->m_body->m_numSubpatterns + 1) << 1); ++i) - output[i] = -1; + for (unsigned i = 0; i < pattern->m_body->m_numSubpatterns + 1; ++i) + output[i << 1] = offsetNoMatch; + + allocatorPool = pattern->m_allocator->startAllocator(); + RELEASE_ASSERT(allocatorPool); DisjunctionContext* context = allocDisjunctionContext(pattern->m_body.get()); @@ -1415,15 +1433,14 @@ public: pattern->m_allocator->stopAllocator(); - // RegExp.cpp currently expects all error to be converted to -1. - ASSERT((result == JSRegExpMatch) == (output[0] != -1)); + ASSERT((result == JSRegExpMatch) == (output[0] != offsetNoMatch)); return output[0]; } - Interpreter(BytecodePattern* pattern, int* output, const UChar* inputChar, unsigned start, unsigned length) + Interpreter(BytecodePattern* pattern, unsigned* output, const CharType* input, unsigned length, unsigned start) : pattern(pattern) , output(output) - , input(inputChar, start, length) + , input(input, start, length) , allocatorPool(0) , remainingMatchCount(matchLimit) { @@ -1431,14 +1448,12 @@ public: private: BytecodePattern* pattern; - int* output; + unsigned* output; InputStream input; BumpPointerPool* allocatorPool; unsigned remainingMatchCount; }; - - class ByteCompiler { struct ParenthesesStackEntry { unsigned beginTerm; @@ -1457,13 +1472,13 @@ public: m_currentAlternativeIndex = 0; } - PassOwnPtr compile(BumpPointerAllocator* allocator) + std::unique_ptr compile(BumpPointerAllocator* allocator) { regexBegin(m_pattern.m_numSubpatterns, m_pattern.m_body->m_callFrameSize, m_pattern.m_body->m_alternatives[0]->onceThrough()); emitDisjunction(m_pattern.m_body); regexEnd(); - return adoptPtr(new BytecodePattern(m_bodyDisjunction.release(), m_allParenthesesInfo, m_pattern, allocator)); + return std::make_unique(WTF::move(m_bodyDisjunction), m_allParenthesesInfo, m_pattern, allocator); } void checkInput(unsigned count) @@ -1476,26 +1491,29 @@ public: m_bodyDisjunction->terms.append(ByteTerm::UncheckInput(count)); } - void assertionBOL(int inputPosition) + void assertionBOL(unsigned inputPosition) { m_bodyDisjunction->terms.append(ByteTerm::BOL(inputPosition)); } - void assertionEOL(int inputPosition) + void assertionEOL(unsigned inputPosition) { m_bodyDisjunction->terms.append(ByteTerm::EOL(inputPosition)); } - void assertionWordBoundary(bool invert, int inputPosition) + void assertionWordBoundary(bool invert, unsigned inputPosition) { m_bodyDisjunction->terms.append(ByteTerm::WordBoundary(invert, inputPosition)); } - void atomPatternCharacter(UChar ch, int inputPosition, unsigned frameLocation, unsigned quantityCount, QuantifierType quantityType) + void atomPatternCharacter(UChar ch, unsigned inputPosition, unsigned frameLocation, Checked quantityCount, QuantifierType quantityType) { if (m_pattern.m_ignoreCase) { - UChar lo = Unicode::toLower(ch); - UChar hi = Unicode::toUpper(ch); + ASSERT(u_tolower(ch) <= 0xFFFF); + ASSERT(u_toupper(ch) <= 0xFFFF); + + UChar lo = u_tolower(ch); + UChar hi = u_toupper(ch); if (lo != hi) { m_bodyDisjunction->terms.append(ByteTerm(lo, hi, inputPosition, frameLocation, quantityCount, quantityType)); @@ -1506,27 +1524,27 @@ public: m_bodyDisjunction->terms.append(ByteTerm(ch, inputPosition, frameLocation, quantityCount, quantityType)); } - void atomCharacterClass(CharacterClass* characterClass, bool invert, int inputPosition, unsigned frameLocation, unsigned quantityCount, QuantifierType quantityType) + void atomCharacterClass(CharacterClass* characterClass, bool invert, unsigned inputPosition, unsigned frameLocation, Checked quantityCount, QuantifierType quantityType) { m_bodyDisjunction->terms.append(ByteTerm(characterClass, invert, inputPosition)); - m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].atom.quantityCount = quantityCount; + m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].atom.quantityCount = quantityCount.unsafeGet(); m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].atom.quantityType = quantityType; m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = frameLocation; } - void atomBackReference(unsigned subpatternId, int inputPosition, unsigned frameLocation, unsigned quantityCount, QuantifierType quantityType) + void atomBackReference(unsigned subpatternId, unsigned inputPosition, unsigned frameLocation, Checked quantityCount, QuantifierType quantityType) { ASSERT(subpatternId); m_bodyDisjunction->terms.append(ByteTerm::BackReference(subpatternId, inputPosition)); - m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].atom.quantityCount = quantityCount; + m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].atom.quantityCount = quantityCount.unsafeGet(); m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].atom.quantityType = quantityType; m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = frameLocation; } - void atomParenthesesOnceBegin(unsigned subpatternId, bool capture, int inputPosition, unsigned frameLocation, unsigned alternativeFrameLocation) + void atomParenthesesOnceBegin(unsigned subpatternId, bool capture, unsigned inputPosition, unsigned frameLocation, unsigned alternativeFrameLocation) { int beginTerm = m_bodyDisjunction->terms.size(); @@ -1539,7 +1557,7 @@ public: m_currentAlternativeIndex = beginTerm + 1; } - void atomParenthesesTerminalBegin(unsigned subpatternId, bool capture, int inputPosition, unsigned frameLocation, unsigned alternativeFrameLocation) + void atomParenthesesTerminalBegin(unsigned subpatternId, bool capture, unsigned inputPosition, unsigned frameLocation, unsigned alternativeFrameLocation) { int beginTerm = m_bodyDisjunction->terms.size(); @@ -1552,7 +1570,7 @@ public: m_currentAlternativeIndex = beginTerm + 1; } - void atomParenthesesSubpatternBegin(unsigned subpatternId, bool capture, int inputPosition, unsigned frameLocation, unsigned alternativeFrameLocation) + void atomParenthesesSubpatternBegin(unsigned subpatternId, bool capture, unsigned inputPosition, unsigned frameLocation, unsigned alternativeFrameLocation) { // Errrk! - this is a little crazy, we initially generate as a TypeParenthesesSubpatternOnceBegin, // then fix this up at the end! - simplifying this should make it much clearer. @@ -1582,7 +1600,7 @@ public: m_currentAlternativeIndex = beginTerm + 1; } - void atomParentheticalAssertionEnd(int inputPosition, unsigned frameLocation, unsigned quantityCount, QuantifierType quantityType) + void atomParentheticalAssertionEnd(unsigned inputPosition, unsigned frameLocation, Checked quantityCount, QuantifierType quantityType) { unsigned beginTerm = popParenthesesStack(); closeAlternative(beginTerm + 1); @@ -1598,9 +1616,9 @@ public: m_bodyDisjunction->terms[endTerm].atom.parenthesesWidth = endTerm - beginTerm; m_bodyDisjunction->terms[endTerm].frameLocation = frameLocation; - m_bodyDisjunction->terms[beginTerm].atom.quantityCount = quantityCount; + m_bodyDisjunction->terms[beginTerm].atom.quantityCount = quantityCount.unsafeGet(); m_bodyDisjunction->terms[beginTerm].atom.quantityType = quantityType; - m_bodyDisjunction->terms[endTerm].atom.quantityCount = quantityCount; + m_bodyDisjunction->terms[endTerm].atom.quantityCount = quantityCount.unsafeGet(); m_bodyDisjunction->terms[endTerm].atom.quantityType = quantityType; } @@ -1626,10 +1644,10 @@ public: #ifndef NDEBUG void dumpDisjunction(ByteDisjunction* disjunction) { - printf("ByteDisjunction(%p):\n\t", disjunction); + dataLogF("ByteDisjunction(%p):\n\t", disjunction); for (unsigned i = 0; i < disjunction->terms.size(); ++i) - printf("{ %d } ", disjunction->terms[i].type); - printf("\n"); + dataLogF("{ %d } ", disjunction->terms[i].type); + dataLogF("\n"); } #endif @@ -1680,7 +1698,7 @@ public: m_bodyDisjunction->terms[endIndex].frameLocation = frameLocation; } - void atomParenthesesSubpatternEnd(unsigned lastSubpatternId, int inputPosition, unsigned frameLocation, unsigned quantityCount, QuantifierType quantityType, unsigned callFrameSize = 0) + void atomParenthesesSubpatternEnd(unsigned lastSubpatternId, int inputPosition, unsigned frameLocation, Checked quantityCount, QuantifierType quantityType, unsigned callFrameSize = 0) { unsigned beginTerm = popParenthesesStack(); closeAlternative(beginTerm + 1); @@ -1694,24 +1712,27 @@ public: unsigned subpatternId = parenthesesBegin.atom.subpatternId; unsigned numSubpatterns = lastSubpatternId - subpatternId + 1; - ByteDisjunction* parenthesesDisjunction = new ByteDisjunction(numSubpatterns, callFrameSize); + auto parenthesesDisjunction = std::make_unique(numSubpatterns, callFrameSize); + + unsigned firstTermInParentheses = beginTerm + 1; + parenthesesDisjunction->terms.reserveInitialCapacity(endTerm - firstTermInParentheses + 2); parenthesesDisjunction->terms.append(ByteTerm::SubpatternBegin()); - for (unsigned termInParentheses = beginTerm + 1; termInParentheses < endTerm; ++termInParentheses) + for (unsigned termInParentheses = firstTermInParentheses; termInParentheses < endTerm; ++termInParentheses) parenthesesDisjunction->terms.append(m_bodyDisjunction->terms[termInParentheses]); parenthesesDisjunction->terms.append(ByteTerm::SubpatternEnd()); m_bodyDisjunction->terms.shrink(beginTerm); - m_allParenthesesInfo.append(parenthesesDisjunction); - m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParenthesesSubpattern, subpatternId, parenthesesDisjunction, capture, inputPosition)); + m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParenthesesSubpattern, subpatternId, parenthesesDisjunction.get(), capture, inputPosition)); + m_allParenthesesInfo.append(WTF::move(parenthesesDisjunction)); - m_bodyDisjunction->terms[beginTerm].atom.quantityCount = quantityCount; + m_bodyDisjunction->terms[beginTerm].atom.quantityCount = quantityCount.unsafeGet(); m_bodyDisjunction->terms[beginTerm].atom.quantityType = quantityType; m_bodyDisjunction->terms[beginTerm].frameLocation = frameLocation; } - void atomParenthesesOnceEnd(int inputPosition, unsigned frameLocation, unsigned quantityCount, QuantifierType quantityType) + void atomParenthesesOnceEnd(int inputPosition, unsigned frameLocation, Checked quantityCount, QuantifierType quantityType) { unsigned beginTerm = popParenthesesStack(); closeAlternative(beginTerm + 1); @@ -1727,13 +1748,13 @@ public: m_bodyDisjunction->terms[endTerm].atom.parenthesesWidth = endTerm - beginTerm; m_bodyDisjunction->terms[endTerm].frameLocation = frameLocation; - m_bodyDisjunction->terms[beginTerm].atom.quantityCount = quantityCount; + m_bodyDisjunction->terms[beginTerm].atom.quantityCount = quantityCount.unsafeGet(); m_bodyDisjunction->terms[beginTerm].atom.quantityType = quantityType; - m_bodyDisjunction->terms[endTerm].atom.quantityCount = quantityCount; + m_bodyDisjunction->terms[endTerm].atom.quantityCount = quantityCount.unsafeGet(); m_bodyDisjunction->terms[endTerm].atom.quantityType = quantityType; } - void atomParenthesesTerminalEnd(int inputPosition, unsigned frameLocation, unsigned quantityCount, QuantifierType quantityType) + void atomParenthesesTerminalEnd(int inputPosition, unsigned frameLocation, Checked quantityCount, QuantifierType quantityType) { unsigned beginTerm = popParenthesesStack(); closeAlternative(beginTerm + 1); @@ -1749,15 +1770,15 @@ public: m_bodyDisjunction->terms[endTerm].atom.parenthesesWidth = endTerm - beginTerm; m_bodyDisjunction->terms[endTerm].frameLocation = frameLocation; - m_bodyDisjunction->terms[beginTerm].atom.quantityCount = quantityCount; + m_bodyDisjunction->terms[beginTerm].atom.quantityCount = quantityCount.unsafeGet(); m_bodyDisjunction->terms[beginTerm].atom.quantityType = quantityType; - m_bodyDisjunction->terms[endTerm].atom.quantityCount = quantityCount; + m_bodyDisjunction->terms[endTerm].atom.quantityCount = quantityCount.unsafeGet(); m_bodyDisjunction->terms[endTerm].atom.quantityType = quantityType; } void regexBegin(unsigned numSubpatterns, unsigned callFrameSize, bool onceThrough) { - m_bodyDisjunction = adoptPtr(new ByteDisjunction(numSubpatterns, callFrameSize)); + m_bodyDisjunction = std::make_unique(numSubpatterns, callFrameSize); m_bodyDisjunction->terms.append(ByteTerm::BodyAlternativeBegin(onceThrough)); m_bodyDisjunction->terms[0].frameLocation = 0; m_currentAlternativeIndex = 0; @@ -1791,7 +1812,7 @@ public: for (unsigned alt = 0; alt < disjunction->m_alternatives.size(); ++alt) { unsigned currentCountAlreadyChecked = inputCountAlreadyChecked; - PatternAlternative* alternative = disjunction->m_alternatives[alt]; + PatternAlternative* alternative = disjunction->m_alternatives[alt].get(); if (alt) { if (disjunction == m_pattern.m_body) @@ -1814,27 +1835,27 @@ public: switch (term.type) { case PatternTerm::TypeAssertionBOL: - assertionBOL(term.inputPosition - currentCountAlreadyChecked); + assertionBOL(currentCountAlreadyChecked - term.inputPosition); break; case PatternTerm::TypeAssertionEOL: - assertionEOL(term.inputPosition - currentCountAlreadyChecked); + assertionEOL(currentCountAlreadyChecked - term.inputPosition); break; case PatternTerm::TypeAssertionWordBoundary: - assertionWordBoundary(term.invert(), term.inputPosition - currentCountAlreadyChecked); + assertionWordBoundary(term.invert(), currentCountAlreadyChecked - term.inputPosition); break; case PatternTerm::TypePatternCharacter: - atomPatternCharacter(term.patternCharacter, term.inputPosition - currentCountAlreadyChecked, term.frameLocation, term.quantityCount, term.quantityType); + atomPatternCharacter(term.patternCharacter, currentCountAlreadyChecked - term.inputPosition, term.frameLocation, term.quantityCount, term.quantityType); break; case PatternTerm::TypeCharacterClass: - atomCharacterClass(term.characterClass, term.invert(), term.inputPosition - currentCountAlreadyChecked, term.frameLocation, term.quantityCount, term.quantityType); + atomCharacterClass(term.characterClass, term.invert(), currentCountAlreadyChecked- term.inputPosition, term.frameLocation, term.quantityCount, term.quantityType); break; case PatternTerm::TypeBackReference: - atomBackReference(term.backReferenceSubpatternId, term.inputPosition - currentCountAlreadyChecked, term.frameLocation, term.quantityCount, term.quantityType); + atomBackReference(term.backReferenceSubpatternId, currentCountAlreadyChecked - term.inputPosition, term.frameLocation, term.quantityCount, term.quantityType); break; case PatternTerm::TypeForwardReference: @@ -1850,17 +1871,17 @@ public: else alternativeFrameLocation += YarrStackSpaceForBackTrackInfoParenthesesOnce; unsigned delegateEndInputOffset = term.inputPosition - currentCountAlreadyChecked; - atomParenthesesOnceBegin(term.parentheses.subpatternId, term.capture(), delegateEndInputOffset - disjunctionAlreadyCheckedCount, term.frameLocation, alternativeFrameLocation); + atomParenthesesOnceBegin(term.parentheses.subpatternId, term.capture(), disjunctionAlreadyCheckedCount - delegateEndInputOffset, term.frameLocation, alternativeFrameLocation); emitDisjunction(term.parentheses.disjunction, currentCountAlreadyChecked, disjunctionAlreadyCheckedCount); atomParenthesesOnceEnd(delegateEndInputOffset, term.frameLocation, term.quantityCount, term.quantityType); } else if (term.parentheses.isTerminal) { unsigned delegateEndInputOffset = term.inputPosition - currentCountAlreadyChecked; - atomParenthesesTerminalBegin(term.parentheses.subpatternId, term.capture(), delegateEndInputOffset - disjunctionAlreadyCheckedCount, term.frameLocation, term.frameLocation + YarrStackSpaceForBackTrackInfoParenthesesOnce); + atomParenthesesTerminalBegin(term.parentheses.subpatternId, term.capture(), disjunctionAlreadyCheckedCount - delegateEndInputOffset, term.frameLocation, term.frameLocation + YarrStackSpaceForBackTrackInfoParenthesesOnce); emitDisjunction(term.parentheses.disjunction, currentCountAlreadyChecked, disjunctionAlreadyCheckedCount); atomParenthesesTerminalEnd(delegateEndInputOffset, term.frameLocation, term.quantityCount, term.quantityType); } else { unsigned delegateEndInputOffset = term.inputPosition - currentCountAlreadyChecked; - atomParenthesesSubpatternBegin(term.parentheses.subpatternId, term.capture(), delegateEndInputOffset - disjunctionAlreadyCheckedCount, term.frameLocation, 0); + atomParenthesesSubpatternBegin(term.parentheses.subpatternId, term.capture(), disjunctionAlreadyCheckedCount - delegateEndInputOffset, term.frameLocation, 0); emitDisjunction(term.parentheses.disjunction, currentCountAlreadyChecked, 0); atomParenthesesSubpatternEnd(term.parentheses.lastSubpatternId, delegateEndInputOffset, term.frameLocation, term.quantityCount, term.quantityType, term.parentheses.disjunction->m_callFrameSize); } @@ -1899,29 +1920,42 @@ public: private: YarrPattern& m_pattern; - OwnPtr m_bodyDisjunction; + std::unique_ptr m_bodyDisjunction; unsigned m_currentAlternativeIndex; Vector m_parenthesesStack; - Vector m_allParenthesesInfo; + Vector> m_allParenthesesInfo; }; -PassOwnPtr byteCompile(YarrPattern& pattern, BumpPointerAllocator* allocator) +std::unique_ptr byteCompile(YarrPattern& pattern, BumpPointerAllocator* allocator) { return ByteCompiler(pattern).compile(allocator); } -int interpret(BytecodePattern* bytecode, const UChar* input, unsigned start, unsigned length, int* output) +unsigned interpret(BytecodePattern* bytecode, const String& input, unsigned start, unsigned* output) +{ + if (input.is8Bit()) + return Interpreter(bytecode, output, input.characters8(), input.length(), start).interpret(); + return Interpreter(bytecode, output, input.characters16(), input.length(), start).interpret(); +} + +unsigned interpret(BytecodePattern* bytecode, const LChar* input, unsigned length, unsigned start, unsigned* output) +{ + return Interpreter(bytecode, output, input, length, start).interpret(); +} + +unsigned interpret(BytecodePattern* bytecode, const UChar* input, unsigned length, unsigned start, unsigned* output) { - return Interpreter(bytecode, output, input, start, length).interpret(); + return Interpreter(bytecode, output, input, length, start).interpret(); } -COMPILE_ASSERT(sizeof(Interpreter::BackTrackInfoPatternCharacter) == (YarrStackSpaceForBackTrackInfoPatternCharacter * sizeof(uintptr_t)), CheckYarrStackSpaceForBackTrackInfoPatternCharacter); -COMPILE_ASSERT(sizeof(Interpreter::BackTrackInfoCharacterClass) == (YarrStackSpaceForBackTrackInfoCharacterClass * sizeof(uintptr_t)), CheckYarrStackSpaceForBackTrackInfoCharacterClass); -COMPILE_ASSERT(sizeof(Interpreter::BackTrackInfoBackReference) == (YarrStackSpaceForBackTrackInfoBackReference * sizeof(uintptr_t)), CheckYarrStackSpaceForBackTrackInfoBackReference); -COMPILE_ASSERT(sizeof(Interpreter::BackTrackInfoAlternative) == (YarrStackSpaceForBackTrackInfoAlternative * sizeof(uintptr_t)), CheckYarrStackSpaceForBackTrackInfoAlternative); -COMPILE_ASSERT(sizeof(Interpreter::BackTrackInfoParentheticalAssertion) == (YarrStackSpaceForBackTrackInfoParentheticalAssertion * sizeof(uintptr_t)), CheckYarrStackSpaceForBackTrackInfoParentheticalAssertion); -COMPILE_ASSERT(sizeof(Interpreter::BackTrackInfoParenthesesOnce) == (YarrStackSpaceForBackTrackInfoParenthesesOnce * sizeof(uintptr_t)), CheckYarrStackSpaceForBackTrackInfoParenthesesOnce); -COMPILE_ASSERT(sizeof(Interpreter::BackTrackInfoParentheses) == (YarrStackSpaceForBackTrackInfoParentheses * sizeof(uintptr_t)), CheckYarrStackSpaceForBackTrackInfoParentheses); +// These should be the same for both UChar & LChar. +COMPILE_ASSERT(sizeof(Interpreter::BackTrackInfoPatternCharacter) == (YarrStackSpaceForBackTrackInfoPatternCharacter * sizeof(uintptr_t)), CheckYarrStackSpaceForBackTrackInfoPatternCharacter); +COMPILE_ASSERT(sizeof(Interpreter::BackTrackInfoCharacterClass) == (YarrStackSpaceForBackTrackInfoCharacterClass * sizeof(uintptr_t)), CheckYarrStackSpaceForBackTrackInfoCharacterClass); +COMPILE_ASSERT(sizeof(Interpreter::BackTrackInfoBackReference) == (YarrStackSpaceForBackTrackInfoBackReference * sizeof(uintptr_t)), CheckYarrStackSpaceForBackTrackInfoBackReference); +COMPILE_ASSERT(sizeof(Interpreter::BackTrackInfoAlternative) == (YarrStackSpaceForBackTrackInfoAlternative * sizeof(uintptr_t)), CheckYarrStackSpaceForBackTrackInfoAlternative); +COMPILE_ASSERT(sizeof(Interpreter::BackTrackInfoParentheticalAssertion) == (YarrStackSpaceForBackTrackInfoParentheticalAssertion * sizeof(uintptr_t)), CheckYarrStackSpaceForBackTrackInfoParentheticalAssertion); +COMPILE_ASSERT(sizeof(Interpreter::BackTrackInfoParenthesesOnce) == (YarrStackSpaceForBackTrackInfoParenthesesOnce * sizeof(uintptr_t)), CheckYarrStackSpaceForBackTrackInfoParenthesesOnce); +COMPILE_ASSERT(sizeof(Interpreter::BackTrackInfoParentheses) == (YarrStackSpaceForBackTrackInfoParentheses * sizeof(uintptr_t)), CheckYarrStackSpaceForBackTrackInfoParentheses); } }