X-Git-Url: https://git.saurik.com/apple/javascriptcore.git/blobdiff_plain/b80e619319b1def83d1e8b4f84042b661be1be7f..14957cd040308e3eeec43d26bae5d76da13fcd85:/yarr/RegexInterpreter.cpp diff --git a/yarr/RegexInterpreter.cpp b/yarr/RegexInterpreter.cpp deleted file mode 100644 index 647b20a..0000000 --- a/yarr/RegexInterpreter.cpp +++ /dev/null @@ -1,1607 +0,0 @@ -/* - * Copyright (C) 2009 Apple Inc. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY - * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR - * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, - * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, - * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR - * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY - * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -#include "config.h" -#include "RegexInterpreter.h" - -#include "RegexCompiler.h" -#include "RegexPattern.h" - -#ifndef NDEBUG -#include -#endif - -#if ENABLE(YARR) - -using namespace WTF; - -namespace JSC { namespace Yarr { - -class Interpreter { -public: - struct ParenthesesDisjunctionContext; - - struct BackTrackInfoPatternCharacter { - uintptr_t matchAmount; - }; - struct BackTrackInfoCharacterClass { - uintptr_t matchAmount; - }; - struct BackTrackInfoBackReference { - uintptr_t begin; // Not really needed for greedy quantifiers. - uintptr_t matchAmount; // Not really needed for fixed quantifiers. - }; - struct BackTrackInfoAlternative { - uintptr_t offset; - }; - struct BackTrackInfoParentheticalAssertion { - uintptr_t begin; - }; - struct BackTrackInfoParenthesesOnce { - uintptr_t inParentheses; - }; - struct BackTrackInfoParentheses { - uintptr_t matchAmount; - ParenthesesDisjunctionContext* lastContext; - uintptr_t prevBegin; - uintptr_t prevEnd; - }; - - static inline void appendParenthesesDisjunctionContext(BackTrackInfoParentheses* backTrack, ParenthesesDisjunctionContext* context) - { - context->next = backTrack->lastContext; - backTrack->lastContext = context; - ++backTrack->matchAmount; - } - - static inline void popParenthesesDisjunctionContext(BackTrackInfoParentheses* backTrack) - { - ASSERT(backTrack->matchAmount); - ASSERT(backTrack->lastContext); - backTrack->lastContext = backTrack->lastContext->next; - --backTrack->matchAmount; - } - - struct DisjunctionContext - { - DisjunctionContext() - : term(0) - { - } - - void* operator new(size_t, void* where) - { - return where; - } - - int term; - unsigned matchBegin; - unsigned matchEnd; - uintptr_t frame[1]; - }; - - DisjunctionContext* allocDisjunctionContext(ByteDisjunction* disjunction) - { - return new(malloc(sizeof(DisjunctionContext) + (disjunction->m_frameSize - 1) * sizeof(uintptr_t))) DisjunctionContext(); - } - - void freeDisjunctionContext(DisjunctionContext* context) - { - free(context); - } - - struct ParenthesesDisjunctionContext - { - ParenthesesDisjunctionContext(int* output, ByteTerm& term) - : next(0) - { - unsigned firstSubpatternId = term.atom.subpatternId; - unsigned numNestedSubpatterns = term.atom.parenthesesDisjunction->m_numSubpatterns; - - for (unsigned i = 0; i < (numNestedSubpatterns << 1); ++i) { - subpatternBackup[i] = output[(firstSubpatternId << 1) + i]; - output[(firstSubpatternId << 1) + i] = -1; - } - - new(getDisjunctionContext(term)) DisjunctionContext(); - } - - void* operator new(size_t, void* where) - { - return where; - } - - void restoreOutput(int* output, unsigned firstSubpatternId, unsigned numNestedSubpatterns) - { - for (unsigned i = 0; i < (numNestedSubpatterns << 1); ++i) - output[(firstSubpatternId << 1) + i] = subpatternBackup[i]; - } - - DisjunctionContext* getDisjunctionContext(ByteTerm& term) - { - return reinterpret_cast(&(subpatternBackup[term.atom.parenthesesDisjunction->m_numSubpatterns << 1])); - } - - ParenthesesDisjunctionContext* next; - int subpatternBackup[1]; - }; - - ParenthesesDisjunctionContext* allocParenthesesDisjunctionContext(ByteDisjunction* disjunction, int* output, ByteTerm& term) - { - return new(malloc(sizeof(ParenthesesDisjunctionContext) + (((term.atom.parenthesesDisjunction->m_numSubpatterns << 1) - 1) * sizeof(int)) + sizeof(DisjunctionContext) + (disjunction->m_frameSize - 1) * sizeof(uintptr_t))) ParenthesesDisjunctionContext(output, term); - } - - void freeParenthesesDisjunctionContext(ParenthesesDisjunctionContext* context) - { - free(context); - } - - class InputStream { - public: - InputStream(const UChar* input, unsigned start, unsigned length) - : input(input) - , pos(start) - , length(length) - { - } - - void next() - { - ++pos; - } - - void rewind(unsigned amount) - { - ASSERT(pos >= amount); - pos -= amount; - } - - int read() - { - ASSERT(pos < length); - if (pos < length) - return input[pos]; - return -1; - } - - int readChecked(int position) - { - ASSERT(position < 0); - ASSERT((unsigned)-position <= pos); - unsigned p = pos + position; - ASSERT(p < length); - return input[p]; - } - - int reread(unsigned from) - { - ASSERT(from < length); - return input[from]; - } - - int prev() - { - ASSERT(!(pos > length)); - if (pos && length) - return input[pos - 1]; - return -1; - } - - unsigned getPos() - { - return pos; - } - - void setPos(unsigned p) - { - pos = p; - } - - bool atStart() - { - return pos == 0; - } - - bool atEnd() - { - return pos == length; - } - - bool checkInput(int count) - { - if ((pos + count) <= length) { - pos += count; - return true; - } else - return false; - } - - void uncheckInput(int count) - { - pos -= count; - } - - bool atStart(int position) - { - return (pos + position) == 0; - } - - bool atEnd(int position) - { - return (pos + position) == length; - } - - private: - const UChar* input; - unsigned pos; - unsigned length; - }; - - bool testCharacterClass(CharacterClass* characterClass, int ch) - { - if (ch & 0xFF80) { - for (unsigned i = 0; i < characterClass->m_matchesUnicode.size(); ++i) - if (ch == characterClass->m_matchesUnicode[i]) - return true; - for (unsigned i = 0; i < characterClass->m_rangesUnicode.size(); ++i) - if ((ch >= characterClass->m_rangesUnicode[i].begin) && (ch <= characterClass->m_rangesUnicode[i].end)) - return true; - } else { - for (unsigned i = 0; i < characterClass->m_matches.size(); ++i) - if (ch == characterClass->m_matches[i]) - return true; - for (unsigned i = 0; i < characterClass->m_ranges.size(); ++i) - if ((ch >= characterClass->m_ranges[i].begin) && (ch <= characterClass->m_ranges[i].end)) - return true; - } - - return false; - } - - bool checkCharacter(int testChar, int inputPosition) - { - return testChar == input.readChecked(inputPosition); - } - - bool checkCasedCharacter(int loChar, int hiChar, int inputPosition) - { - int ch = input.readChecked(inputPosition); - return (loChar == ch) || (hiChar == ch); - } - - bool checkCharacterClass(CharacterClass* characterClass, bool invert, int inputPosition) - { - bool match = testCharacterClass(characterClass, input.readChecked(inputPosition)); - return invert ? !match : match; - } - - bool tryConsumeBackReference(int matchBegin, int matchEnd, int inputOffset) - { - int matchSize = matchEnd - matchBegin; - - if (!input.checkInput(matchSize)) - return false; - - for (int i = 0; i < matchSize; ++i) { - if (!checkCharacter(input.reread(matchBegin + i), inputOffset - matchSize + i)) { - input.uncheckInput(matchSize); - return false; - } - } - - return true; - } - - bool matchAssertionBOL(ByteTerm& term) - { - return (input.atStart(term.inputPosition)) || (pattern->m_multiline && testCharacterClass(pattern->newlineCharacterClass, input.readChecked(term.inputPosition - 1))); - } - - bool matchAssertionEOL(ByteTerm& term) - { - if (term.inputPosition) - return (input.atEnd(term.inputPosition)) || (pattern->m_multiline && testCharacterClass(pattern->newlineCharacterClass, input.readChecked(term.inputPosition))); - else - return (input.atEnd()) || (pattern->m_multiline && testCharacterClass(pattern->newlineCharacterClass, input.read())); - } - - bool matchAssertionWordBoundary(ByteTerm& term) - { - bool prevIsWordchar = !input.atStart(term.inputPosition) && testCharacterClass(pattern->wordcharCharacterClass, input.readChecked(term.inputPosition - 1)); - bool readIsWordchar; - if (term.inputPosition) - readIsWordchar = !input.atEnd(term.inputPosition) && testCharacterClass(pattern->wordcharCharacterClass, input.readChecked(term.inputPosition)); - else - readIsWordchar = !input.atEnd() && testCharacterClass(pattern->wordcharCharacterClass, input.read()); - - bool wordBoundary = prevIsWordchar != readIsWordchar; - return term.invert() ? !wordBoundary : wordBoundary; - } - - bool backtrackPatternCharacter(ByteTerm& term, DisjunctionContext* context) - { - BackTrackInfoPatternCharacter* backTrack = reinterpret_cast(context->frame + term.frameLocation); - - switch (term.atom.quantityType) { - case QuantifierFixedCount: - break; - - case QuantifierGreedy: - if (backTrack->matchAmount) { - --backTrack->matchAmount; - input.uncheckInput(1); - return true; - } - break; - - case QuantifierNonGreedy: - if ((backTrack->matchAmount < term.atom.quantityCount) && input.checkInput(1)) { - ++backTrack->matchAmount; - if (checkCharacter(term.atom.patternCharacter, term.inputPosition - 1)) - return true; - } - input.uncheckInput(backTrack->matchAmount); - break; - } - - return false; - } - - bool backtrackPatternCasedCharacter(ByteTerm& term, DisjunctionContext* context) - { - BackTrackInfoPatternCharacter* backTrack = reinterpret_cast(context->frame + term.frameLocation); - - switch (term.atom.quantityType) { - case QuantifierFixedCount: - break; - - case QuantifierGreedy: - if (backTrack->matchAmount) { - --backTrack->matchAmount; - input.uncheckInput(1); - return true; - } - break; - - case QuantifierNonGreedy: - if ((backTrack->matchAmount < term.atom.quantityCount) && input.checkInput(1)) { - ++backTrack->matchAmount; - if (checkCasedCharacter(term.atom.casedCharacter.lo, term.atom.casedCharacter.hi, term.inputPosition - 1)) - return true; - } - input.uncheckInput(backTrack->matchAmount); - break; - } - - return false; - } - - bool matchCharacterClass(ByteTerm& term, DisjunctionContext* context) - { - ASSERT(term.type == ByteTerm::TypeCharacterClass); - BackTrackInfoPatternCharacter* backTrack = reinterpret_cast(context->frame + term.frameLocation); - - switch (term.atom.quantityType) { - case QuantifierFixedCount: { - for (unsigned matchAmount = 0; matchAmount < term.atom.quantityCount; ++matchAmount) { - if (!checkCharacterClass(term.atom.characterClass, term.invert(), term.inputPosition + matchAmount)) - return false; - } - return true; - } - - case QuantifierGreedy: { - unsigned matchAmount = 0; - while ((matchAmount < term.atom.quantityCount) && input.checkInput(1)) { - if (!checkCharacterClass(term.atom.characterClass, term.invert(), term.inputPosition - 1)) { - input.uncheckInput(1); - break; - } - ++matchAmount; - } - backTrack->matchAmount = matchAmount; - - return true; - } - - case QuantifierNonGreedy: - backTrack->matchAmount = 0; - return true; - } - - ASSERT_NOT_REACHED(); - return false; - } - - bool backtrackCharacterClass(ByteTerm& term, DisjunctionContext* context) - { - ASSERT(term.type == ByteTerm::TypeCharacterClass); - BackTrackInfoPatternCharacter* backTrack = reinterpret_cast(context->frame + term.frameLocation); - - switch (term.atom.quantityType) { - case QuantifierFixedCount: - break; - - case QuantifierGreedy: - if (backTrack->matchAmount) { - --backTrack->matchAmount; - input.uncheckInput(1); - return true; - } - break; - - case QuantifierNonGreedy: - if ((backTrack->matchAmount < term.atom.quantityCount) && input.checkInput(1)) { - ++backTrack->matchAmount; - if (checkCharacterClass(term.atom.characterClass, term.invert(), term.inputPosition - 1)) - return true; - } - input.uncheckInput(backTrack->matchAmount); - break; - } - - return false; - } - - bool matchBackReference(ByteTerm& term, DisjunctionContext* context) - { - ASSERT(term.type == ByteTerm::TypeBackReference); - BackTrackInfoBackReference* backTrack = reinterpret_cast(context->frame + term.frameLocation); - - int matchBegin = output[(term.atom.subpatternId << 1)]; - int matchEnd = output[(term.atom.subpatternId << 1) + 1]; - ASSERT((matchBegin == -1) == (matchEnd == -1)); - ASSERT(matchBegin <= matchEnd); - - if (matchBegin == matchEnd) - return true; - - switch (term.atom.quantityType) { - case QuantifierFixedCount: { - backTrack->begin = input.getPos(); - for (unsigned matchAmount = 0; matchAmount < term.atom.quantityCount; ++matchAmount) { - if (!tryConsumeBackReference(matchBegin, matchEnd, term.inputPosition)) { - input.setPos(backTrack->begin); - return false; - } - } - return true; - } - - case QuantifierGreedy: { - unsigned matchAmount = 0; - while ((matchAmount < term.atom.quantityCount) && tryConsumeBackReference(matchBegin, matchEnd, term.inputPosition)) - ++matchAmount; - backTrack->matchAmount = matchAmount; - return true; - } - - case QuantifierNonGreedy: - backTrack->begin = input.getPos(); - backTrack->matchAmount = 0; - return true; - } - - ASSERT_NOT_REACHED(); - return false; - } - - bool backtrackBackReference(ByteTerm& term, DisjunctionContext* context) - { - ASSERT(term.type == ByteTerm::TypeBackReference); - BackTrackInfoBackReference* backTrack = reinterpret_cast(context->frame + term.frameLocation); - - int matchBegin = output[(term.atom.subpatternId << 1)]; - int matchEnd = output[(term.atom.subpatternId << 1) + 1]; - ASSERT((matchBegin == -1) == (matchEnd == -1)); - ASSERT(matchBegin <= matchEnd); - - if (matchBegin == matchEnd) - return false; - - switch (term.atom.quantityType) { - case QuantifierFixedCount: - // for quantityCount == 1, could rewind. - input.setPos(backTrack->begin); - break; - - case QuantifierGreedy: - if (backTrack->matchAmount) { - --backTrack->matchAmount; - input.rewind(matchEnd - matchBegin); - return true; - } - break; - - case QuantifierNonGreedy: - if ((backTrack->matchAmount < term.atom.quantityCount) && tryConsumeBackReference(matchBegin, matchEnd, term.inputPosition)) { - ++backTrack->matchAmount; - return true; - } else - input.setPos(backTrack->begin); - break; - } - - return false; - } - - void recordParenthesesMatch(ByteTerm& term, ParenthesesDisjunctionContext* context) - { - if (term.capture()) { - unsigned subpatternId = term.atom.subpatternId; - output[(subpatternId << 1)] = context->getDisjunctionContext(term)->matchBegin + term.inputPosition; - output[(subpatternId << 1) + 1] = context->getDisjunctionContext(term)->matchEnd + term.inputPosition; - } - } - void resetMatches(ByteTerm& term, ParenthesesDisjunctionContext* context) - { - unsigned firstSubpatternId = term.atom.subpatternId; - unsigned count = term.atom.parenthesesDisjunction->m_numSubpatterns; - context->restoreOutput(output, firstSubpatternId, count); - } - void resetAssertionMatches(ByteTerm& term) - { - unsigned firstSubpatternId = term.atom.subpatternId; - unsigned count = term.atom.parenthesesDisjunction->m_numSubpatterns; - for (unsigned i = 0; i < (count << 1); ++i) - output[(firstSubpatternId << 1) + i] = -1; - } - bool parenthesesDoBacktrack(ByteTerm& term, BackTrackInfoParentheses* backTrack) - { - while (backTrack->matchAmount) { - ParenthesesDisjunctionContext* context = backTrack->lastContext; - - if (matchDisjunction(term.atom.parenthesesDisjunction, context->getDisjunctionContext(term), true)) - return true; - - resetMatches(term, context); - popParenthesesDisjunctionContext(backTrack); - freeParenthesesDisjunctionContext(context); - } - - return false; - } - - bool matchParenthesesOnceBegin(ByteTerm& term, DisjunctionContext* context) - { - ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternOnceBegin); - ASSERT(term.atom.quantityCount == 1); - - BackTrackInfoParenthesesOnce* backTrack = reinterpret_cast(context->frame + term.frameLocation); - - switch (term.atom.quantityType) { - case QuantifierGreedy: { - // set this speculatively; if we get to the parens end this will be true. - backTrack->inParentheses = 1; - break; - } - case QuantifierNonGreedy: { - backTrack->inParentheses = 0; - context->term += term.atom.parenthesesWidth; - return true; - } - case QuantifierFixedCount: - break; - } - - if (term.capture()) { - unsigned subpatternId = term.atom.subpatternId; - output[(subpatternId << 1)] = input.getPos() + term.inputPosition; - } - - return true; - } - - bool matchParenthesesOnceEnd(ByteTerm& term, DisjunctionContext*) - { - ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternOnceEnd); - ASSERT(term.atom.quantityCount == 1); - - if (term.capture()) { - unsigned subpatternId = term.atom.subpatternId; - output[(subpatternId << 1) + 1] = input.getPos() + term.inputPosition; - } - return true; - } - - bool backtrackParenthesesOnceBegin(ByteTerm& term, DisjunctionContext* context) - { - ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternOnceBegin); - ASSERT(term.atom.quantityCount == 1); - - BackTrackInfoParenthesesOnce* backTrack = reinterpret_cast(context->frame + term.frameLocation); - - if (term.capture()) { - unsigned subpatternId = term.atom.subpatternId; - output[(subpatternId << 1)] = -1; - output[(subpatternId << 1) + 1] = -1; - } - - switch (term.atom.quantityType) { - case QuantifierGreedy: - // if we backtrack to this point, there is another chance - try matching nothing. - ASSERT(backTrack->inParentheses); - backTrack->inParentheses = 0; - context->term += term.atom.parenthesesWidth; - return true; - case QuantifierNonGreedy: - ASSERT(backTrack->inParentheses); - case QuantifierFixedCount: - break; - } - - return false; - } - - bool backtrackParenthesesOnceEnd(ByteTerm& term, DisjunctionContext* context) - { - ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternOnceEnd); - ASSERT(term.atom.quantityCount == 1); - - BackTrackInfoParenthesesOnce* backTrack = reinterpret_cast(context->frame + term.frameLocation); - - switch (term.atom.quantityType) { - case QuantifierGreedy: - if (!backTrack->inParentheses) { - context->term -= term.atom.parenthesesWidth; - return false; - } - case QuantifierNonGreedy: - if (!backTrack->inParentheses) { - // now try to match the parens; set this speculatively. - backTrack->inParentheses = 1; - if (term.capture()) { - unsigned subpatternId = term.atom.subpatternId; - output[(subpatternId << 1) + 1] = input.getPos() + term.inputPosition; - } - context->term -= term.atom.parenthesesWidth; - return true; - } - case QuantifierFixedCount: - break; - } - - return false; - } - - bool matchParentheticalAssertionBegin(ByteTerm& term, DisjunctionContext* context) - { - ASSERT(term.type == ByteTerm::TypeParentheticalAssertionBegin); - ASSERT(term.atom.quantityCount == 1); - - BackTrackInfoParentheticalAssertion* backTrack = reinterpret_cast(context->frame + term.frameLocation); - - backTrack->begin = input.getPos(); - return true; - } - - bool matchParentheticalAssertionEnd(ByteTerm& term, DisjunctionContext* context) - { - ASSERT(term.type == ByteTerm::TypeParentheticalAssertionEnd); - ASSERT(term.atom.quantityCount == 1); - - BackTrackInfoParentheticalAssertion* backTrack = reinterpret_cast(context->frame + term.frameLocation); - - input.setPos(backTrack->begin); - - // We've reached the end of the parens; if they are inverted, this is failure. - if (term.invert()) { - context->term -= term.atom.parenthesesWidth; - return false; - } - - return true; - } - - bool backtrackParentheticalAssertionBegin(ByteTerm& term, DisjunctionContext* context) - { - ASSERT(term.type == ByteTerm::TypeParentheticalAssertionBegin); - ASSERT(term.atom.quantityCount == 1); - - // We've failed to match parens; if they are inverted, this is win! - if (term.invert()) { - context->term += term.atom.parenthesesWidth; - return true; - } - - return false; - } - - bool backtrackParentheticalAssertionEnd(ByteTerm& term, DisjunctionContext* context) - { - ASSERT(term.type == ByteTerm::TypeParentheticalAssertionEnd); - ASSERT(term.atom.quantityCount == 1); - - BackTrackInfoParentheticalAssertion* backTrack = reinterpret_cast(context->frame + term.frameLocation); - - input.setPos(backTrack->begin); - - context->term -= term.atom.parenthesesWidth; - return false; - } - - bool matchParentheses(ByteTerm& term, DisjunctionContext* context) - { - ASSERT(term.type == ByteTerm::TypeParenthesesSubpattern); - - BackTrackInfoParentheses* backTrack = reinterpret_cast(context->frame + term.frameLocation); - - unsigned subpatternId = term.atom.subpatternId; - ByteDisjunction* disjunctionBody = term.atom.parenthesesDisjunction; - - backTrack->prevBegin = output[(subpatternId << 1)]; - backTrack->prevEnd = output[(subpatternId << 1) + 1]; - - backTrack->matchAmount = 0; - backTrack->lastContext = 0; - - switch (term.atom.quantityType) { - case QuantifierFixedCount: { - // While we haven't yet reached our fixed limit, - while (backTrack->matchAmount < term.atom.quantityCount) { - // Try to do a match, and it it succeeds, add it to the list. - ParenthesesDisjunctionContext* context = allocParenthesesDisjunctionContext(disjunctionBody, output, term); - if (matchDisjunction(disjunctionBody, context->getDisjunctionContext(term))) - appendParenthesesDisjunctionContext(backTrack, context); - else { - // The match failed; try to find an alternate point to carry on from. - resetMatches(term, context); - freeParenthesesDisjunctionContext(context); - if (!parenthesesDoBacktrack(term, backTrack)) - return false; - } - } - - ASSERT(backTrack->matchAmount == term.atom.quantityCount); - ParenthesesDisjunctionContext* context = backTrack->lastContext; - recordParenthesesMatch(term, context); - return true; - } - - case QuantifierGreedy: { - while (backTrack->matchAmount < term.atom.quantityCount) { - ParenthesesDisjunctionContext* context = allocParenthesesDisjunctionContext(disjunctionBody, output, term); - if (matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term))) - appendParenthesesDisjunctionContext(backTrack, context); - else { - resetMatches(term, context); - freeParenthesesDisjunctionContext(context); - break; - } - } - - if (backTrack->matchAmount) { - ParenthesesDisjunctionContext* context = backTrack->lastContext; - recordParenthesesMatch(term, context); - } - return true; - } - - case QuantifierNonGreedy: - return true; - } - - ASSERT_NOT_REACHED(); - return false; - } - - // Rules for backtracking differ depending on whether this is greedy or non-greedy. - // - // Greedy matches never should try just adding more - you should already have done - // the 'more' cases. Always backtrack, at least a leetle bit. However cases where - // you backtrack an item off the list needs checking, since we'll never have matched - // the one less case. Tracking forwards, still add as much as possible. - // - // Non-greedy, we've already done the one less case, so don't match on popping. - // We haven't done the one more case, so always try to add that. - // - bool backtrackParentheses(ByteTerm& term, DisjunctionContext* context) - { - ASSERT(term.type == ByteTerm::TypeParenthesesSubpattern); - - BackTrackInfoParentheses* backTrack = reinterpret_cast(context->frame + term.frameLocation); - - if (term.capture()) { - unsigned subpatternId = term.atom.subpatternId; - output[(subpatternId << 1)] = backTrack->prevBegin; - output[(subpatternId << 1) + 1] = backTrack->prevEnd; - } - - ByteDisjunction* disjunctionBody = term.atom.parenthesesDisjunction; - - switch (term.atom.quantityType) { - case QuantifierFixedCount: { - ASSERT(backTrack->matchAmount == term.atom.quantityCount); - - ParenthesesDisjunctionContext* context = 0; - - if (!parenthesesDoBacktrack(term, backTrack)) - return false; - - // While we haven't yet reached our fixed limit, - while (backTrack->matchAmount < term.atom.quantityCount) { - // Try to do a match, and it it succeeds, add it to the list. - context = allocParenthesesDisjunctionContext(disjunctionBody, output, term); - if (matchDisjunction(disjunctionBody, context->getDisjunctionContext(term))) - appendParenthesesDisjunctionContext(backTrack, context); - else { - // The match failed; try to find an alternate point to carry on from. - resetMatches(term, context); - freeParenthesesDisjunctionContext(context); - if (!parenthesesDoBacktrack(term, backTrack)) - return false; - } - } - - ASSERT(backTrack->matchAmount == term.atom.quantityCount); - context = backTrack->lastContext; - recordParenthesesMatch(term, context); - return true; - } - - case QuantifierGreedy: { - if (!backTrack->matchAmount) - return false; - - ParenthesesDisjunctionContext* context = backTrack->lastContext; - if (matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term), true)) { - while (backTrack->matchAmount < term.atom.quantityCount) { - ParenthesesDisjunctionContext* context = allocParenthesesDisjunctionContext(disjunctionBody, output, term); - if (matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term))) - appendParenthesesDisjunctionContext(backTrack, context); - else { - resetMatches(term, context); - freeParenthesesDisjunctionContext(context); - break; - } - } - } else { - resetMatches(term, context); - popParenthesesDisjunctionContext(backTrack); - freeParenthesesDisjunctionContext(context); - } - - if (backTrack->matchAmount) { - ParenthesesDisjunctionContext* context = backTrack->lastContext; - recordParenthesesMatch(term, context); - } - return true; - } - - case QuantifierNonGreedy: { - // If we've not reached the limit, try to add one more match. - if (backTrack->matchAmount < term.atom.quantityCount) { - ParenthesesDisjunctionContext* context = allocParenthesesDisjunctionContext(disjunctionBody, output, term); - if (matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term))) { - appendParenthesesDisjunctionContext(backTrack, context); - recordParenthesesMatch(term, context); - return true; - } else { - resetMatches(term, context); - freeParenthesesDisjunctionContext(context); - } - } - - // Nope - okay backtrack looking for an alternative. - while (backTrack->matchAmount) { - ParenthesesDisjunctionContext* context = backTrack->lastContext; - if (matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term), true)) { - // successful backtrack! we're back in the game! - if (backTrack->matchAmount) { - context = backTrack->lastContext; - recordParenthesesMatch(term, context); - } - return true; - } - - // pop a match off the stack - resetMatches(term, context); - popParenthesesDisjunctionContext(backTrack); - freeParenthesesDisjunctionContext(context); - } - - return false; - } - } - - ASSERT_NOT_REACHED(); - return false; - } - -#define MATCH_NEXT() { ++context->term; goto matchAgain; } -#define BACKTRACK() { --context->term; goto backtrack; } -#define currentTerm() (disjunction->terms[context->term]) - bool matchDisjunction(ByteDisjunction* disjunction, DisjunctionContext* context, bool btrack = false) - { - if (btrack) - BACKTRACK(); - - context->matchBegin = input.getPos(); - context->term = 0; - - matchAgain: - ASSERT(context->term < static_cast(disjunction->terms.size())); - - switch (currentTerm().type) { - case ByteTerm::TypeSubpatternBegin: - MATCH_NEXT(); - case ByteTerm::TypeSubpatternEnd: - context->matchEnd = input.getPos(); - return true; - - case ByteTerm::TypeBodyAlternativeBegin: - MATCH_NEXT(); - case ByteTerm::TypeBodyAlternativeDisjunction: - case ByteTerm::TypeBodyAlternativeEnd: - context->matchEnd = input.getPos(); - return true; - - case ByteTerm::TypeAlternativeBegin: - MATCH_NEXT(); - case ByteTerm::TypeAlternativeDisjunction: - case ByteTerm::TypeAlternativeEnd: { - int offset = currentTerm().alternative.end; - BackTrackInfoAlternative* backTrack = reinterpret_cast(context->frame + currentTerm().frameLocation); - backTrack->offset = offset; - context->term += offset; - MATCH_NEXT(); - } - - case ByteTerm::TypeAssertionBOL: - if (matchAssertionBOL(currentTerm())) - MATCH_NEXT(); - BACKTRACK(); - case ByteTerm::TypeAssertionEOL: - if (matchAssertionEOL(currentTerm())) - MATCH_NEXT(); - BACKTRACK(); - case ByteTerm::TypeAssertionWordBoundary: - if (matchAssertionWordBoundary(currentTerm())) - MATCH_NEXT(); - BACKTRACK(); - - case ByteTerm::TypePatternCharacterOnce: - case ByteTerm::TypePatternCharacterFixed: { - for (unsigned matchAmount = 0; matchAmount < currentTerm().atom.quantityCount; ++matchAmount) { - if (!checkCharacter(currentTerm().atom.patternCharacter, currentTerm().inputPosition + matchAmount)) - BACKTRACK(); - } - MATCH_NEXT(); - } - case ByteTerm::TypePatternCharacterGreedy: { - BackTrackInfoPatternCharacter* backTrack = reinterpret_cast(context->frame + currentTerm().frameLocation); - unsigned matchAmount = 0; - while ((matchAmount < currentTerm().atom.quantityCount) && input.checkInput(1)) { - if (!checkCharacter(currentTerm().atom.patternCharacter, currentTerm().inputPosition - 1)) { - input.uncheckInput(1); - break; - } - ++matchAmount; - } - backTrack->matchAmount = matchAmount; - - MATCH_NEXT(); - } - case ByteTerm::TypePatternCharacterNonGreedy: { - BackTrackInfoPatternCharacter* backTrack = reinterpret_cast(context->frame + currentTerm().frameLocation); - backTrack->matchAmount = 0; - MATCH_NEXT(); - } - - case ByteTerm::TypePatternCasedCharacterOnce: - case ByteTerm::TypePatternCasedCharacterFixed: { - for (unsigned matchAmount = 0; matchAmount < currentTerm().atom.quantityCount; ++matchAmount) { - if (!checkCasedCharacter(currentTerm().atom.casedCharacter.lo, currentTerm().atom.casedCharacter.hi, currentTerm().inputPosition + matchAmount)) - BACKTRACK(); - } - MATCH_NEXT(); - } - case ByteTerm::TypePatternCasedCharacterGreedy: { - BackTrackInfoPatternCharacter* backTrack = reinterpret_cast(context->frame + currentTerm().frameLocation); - unsigned matchAmount = 0; - while ((matchAmount < currentTerm().atom.quantityCount) && input.checkInput(1)) { - if (!checkCasedCharacter(currentTerm().atom.casedCharacter.lo, currentTerm().atom.casedCharacter.hi, currentTerm().inputPosition - 1)) { - input.uncheckInput(1); - break; - } - ++matchAmount; - } - backTrack->matchAmount = matchAmount; - - MATCH_NEXT(); - } - case ByteTerm::TypePatternCasedCharacterNonGreedy: { - BackTrackInfoPatternCharacter* backTrack = reinterpret_cast(context->frame + currentTerm().frameLocation); - backTrack->matchAmount = 0; - MATCH_NEXT(); - } - - case ByteTerm::TypeCharacterClass: - if (matchCharacterClass(currentTerm(), context)) - MATCH_NEXT(); - BACKTRACK(); - case ByteTerm::TypeBackReference: - if (matchBackReference(currentTerm(), context)) - MATCH_NEXT(); - BACKTRACK(); - case ByteTerm::TypeParenthesesSubpattern: - if (matchParentheses(currentTerm(), context)) - MATCH_NEXT(); - BACKTRACK(); - case ByteTerm::TypeParenthesesSubpatternOnceBegin: - if (matchParenthesesOnceBegin(currentTerm(), context)) - MATCH_NEXT(); - BACKTRACK(); - case ByteTerm::TypeParenthesesSubpatternOnceEnd: - if (matchParenthesesOnceEnd(currentTerm(), context)) - MATCH_NEXT(); - BACKTRACK(); - case ByteTerm::TypeParentheticalAssertionBegin: - if (matchParentheticalAssertionBegin(currentTerm(), context)) - MATCH_NEXT(); - BACKTRACK(); - case ByteTerm::TypeParentheticalAssertionEnd: - if (matchParentheticalAssertionEnd(currentTerm(), context)) - MATCH_NEXT(); - BACKTRACK(); - - case ByteTerm::TypeCheckInput: - if (input.checkInput(currentTerm().checkInputCount)) - MATCH_NEXT(); - BACKTRACK(); - } - - // We should never fall-through to here. - ASSERT_NOT_REACHED(); - - backtrack: - ASSERT(context->term < static_cast(disjunction->terms.size())); - - switch (currentTerm().type) { - case ByteTerm::TypeSubpatternBegin: - return false; - case ByteTerm::TypeSubpatternEnd: - ASSERT_NOT_REACHED(); - - case ByteTerm::TypeBodyAlternativeBegin: - case ByteTerm::TypeBodyAlternativeDisjunction: { - int offset = currentTerm().alternative.next; - context->term += offset; - if (offset > 0) - MATCH_NEXT(); - - if (input.atEnd()) - return false; - - input.next(); - context->matchBegin = input.getPos(); - MATCH_NEXT(); - } - case ByteTerm::TypeBodyAlternativeEnd: - ASSERT_NOT_REACHED(); - - case ByteTerm::TypeAlternativeBegin: - case ByteTerm::TypeAlternativeDisjunction: { - int offset = currentTerm().alternative.next; - context->term += offset; - if (offset > 0) - MATCH_NEXT(); - BACKTRACK(); - } - case ByteTerm::TypeAlternativeEnd: { - // We should never backtrack back into an alternative of the main body of the regex. - BackTrackInfoAlternative* backTrack = reinterpret_cast(context->frame + currentTerm().frameLocation); - unsigned offset = backTrack->offset; - context->term -= offset; - BACKTRACK(); - } - - case ByteTerm::TypeAssertionBOL: - case ByteTerm::TypeAssertionEOL: - case ByteTerm::TypeAssertionWordBoundary: - BACKTRACK(); - - case ByteTerm::TypePatternCharacterOnce: - case ByteTerm::TypePatternCharacterFixed: - case ByteTerm::TypePatternCharacterGreedy: - case ByteTerm::TypePatternCharacterNonGreedy: - if (backtrackPatternCharacter(currentTerm(), context)) - MATCH_NEXT(); - BACKTRACK(); - case ByteTerm::TypePatternCasedCharacterOnce: - case ByteTerm::TypePatternCasedCharacterFixed: - case ByteTerm::TypePatternCasedCharacterGreedy: - case ByteTerm::TypePatternCasedCharacterNonGreedy: - if (backtrackPatternCasedCharacter(currentTerm(), context)) - MATCH_NEXT(); - BACKTRACK(); - case ByteTerm::TypeCharacterClass: - if (backtrackCharacterClass(currentTerm(), context)) - MATCH_NEXT(); - BACKTRACK(); - case ByteTerm::TypeBackReference: - if (backtrackBackReference(currentTerm(), context)) - MATCH_NEXT(); - BACKTRACK(); - case ByteTerm::TypeParenthesesSubpattern: - if (backtrackParentheses(currentTerm(), context)) - MATCH_NEXT(); - BACKTRACK(); - case ByteTerm::TypeParenthesesSubpatternOnceBegin: - if (backtrackParenthesesOnceBegin(currentTerm(), context)) - MATCH_NEXT(); - BACKTRACK(); - case ByteTerm::TypeParenthesesSubpatternOnceEnd: - if (backtrackParenthesesOnceEnd(currentTerm(), context)) - MATCH_NEXT(); - BACKTRACK(); - case ByteTerm::TypeParentheticalAssertionBegin: - if (backtrackParentheticalAssertionBegin(currentTerm(), context)) - MATCH_NEXT(); - BACKTRACK(); - case ByteTerm::TypeParentheticalAssertionEnd: - if (backtrackParentheticalAssertionEnd(currentTerm(), context)) - MATCH_NEXT(); - BACKTRACK(); - - case ByteTerm::TypeCheckInput: - input.uncheckInput(currentTerm().checkInputCount); - BACKTRACK(); - } - - ASSERT_NOT_REACHED(); - return false; - } - - bool matchNonZeroDisjunction(ByteDisjunction* disjunction, DisjunctionContext* context, bool btrack = false) - { - if (matchDisjunction(disjunction, context, btrack)) { - while (context->matchBegin == context->matchEnd) { - if (!matchDisjunction(disjunction, context, true)) - return false; - } - return true; - } - - return false; - } - - int interpret() - { - for (unsigned i = 0; i < ((pattern->m_body->m_numSubpatterns + 1) << 1); ++i) - output[i] = -1; - - DisjunctionContext* context = allocDisjunctionContext(pattern->m_body.get()); - - if (matchDisjunction(pattern->m_body.get(), context)) { - output[0] = context->matchBegin; - output[1] = context->matchEnd; - } - - freeDisjunctionContext(context); - - return output[0]; - } - - Interpreter(BytecodePattern* pattern, int* output, const UChar* inputChar, unsigned start, unsigned length) - : pattern(pattern) - , output(output) - , input(inputChar, start, length) - { - } - -private: - BytecodePattern *pattern; - int* output; - InputStream input; -}; - - - -class ByteCompiler { - struct ParenthesesStackEntry { - unsigned beginTerm; - unsigned savedAlternativeIndex; - ParenthesesStackEntry(unsigned beginTerm, unsigned savedAlternativeIndex/*, unsigned subpatternId, bool capture = false*/) - : beginTerm(beginTerm) - , savedAlternativeIndex(savedAlternativeIndex) - { - } - }; - -public: - ByteCompiler(RegexPattern& pattern) - : m_pattern(pattern) - { - m_bodyDisjunction = 0; - m_currentAlternativeIndex = 0; - } - - BytecodePattern* compile() - { - regexBegin(m_pattern.m_numSubpatterns, m_pattern.m_body->m_callFrameSize); - emitDisjunction(m_pattern.m_body); - regexEnd(); - - return new BytecodePattern(m_bodyDisjunction, m_allParenthesesInfo, m_pattern); - } - - void checkInput(unsigned count) - { - m_bodyDisjunction->terms.append(ByteTerm::CheckInput(count)); - } - - void assertionBOL(int inputPosition) - { - m_bodyDisjunction->terms.append(ByteTerm::BOL(inputPosition)); - } - - void assertionEOL(int inputPosition) - { - m_bodyDisjunction->terms.append(ByteTerm::EOL(inputPosition)); - } - - void assertionWordBoundary(bool invert, int inputPosition) - { - m_bodyDisjunction->terms.append(ByteTerm::WordBoundary(invert, inputPosition)); - } - - void atomPatternCharacter(UChar ch, int inputPosition, unsigned frameLocation, unsigned quantityCount, QuantifierType quantityType) - { - if (m_pattern.m_ignoreCase) { - UChar lo = Unicode::toLower(ch); - UChar hi = Unicode::toUpper(ch); - - if (lo != hi) { - m_bodyDisjunction->terms.append(ByteTerm(lo, hi, inputPosition, frameLocation, quantityCount, quantityType)); - return; - } - } - - m_bodyDisjunction->terms.append(ByteTerm(ch, inputPosition, frameLocation, quantityCount, quantityType)); - } - - void atomCharacterClass(CharacterClass* characterClass, bool invert, int inputPosition, unsigned frameLocation, unsigned quantityCount, QuantifierType quantityType) - { - m_bodyDisjunction->terms.append(ByteTerm(characterClass, invert, inputPosition)); - - m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].atom.quantityCount = quantityCount; - m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].atom.quantityType = quantityType; - m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = frameLocation; - } - - void atomBackReference(unsigned subpatternId, int inputPosition, unsigned frameLocation, unsigned quantityCount, QuantifierType quantityType) - { - ASSERT(subpatternId); - - m_bodyDisjunction->terms.append(ByteTerm::BackReference(subpatternId, inputPosition)); - - m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].atom.quantityCount = quantityCount; - m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].atom.quantityType = quantityType; - m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = frameLocation; - } - - void atomParenthesesSubpatternBegin(unsigned subpatternId, bool capture, int inputPosition, unsigned frameLocation, unsigned alternativeFrameLocation) - { - int beginTerm = m_bodyDisjunction->terms.size(); - - m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParenthesesSubpatternOnceBegin, subpatternId, capture, inputPosition)); - m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = frameLocation; - m_bodyDisjunction->terms.append(ByteTerm::AlternativeBegin()); - m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = alternativeFrameLocation; - - m_parenthesesStack.append(ParenthesesStackEntry(beginTerm, m_currentAlternativeIndex)); - m_currentAlternativeIndex = beginTerm + 1; - } - - void atomParentheticalAssertionBegin(unsigned subpatternId, bool invert, unsigned frameLocation, unsigned alternativeFrameLocation) - { - int beginTerm = m_bodyDisjunction->terms.size(); - - m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParentheticalAssertionBegin, subpatternId, invert, 0)); - m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = frameLocation; - m_bodyDisjunction->terms.append(ByteTerm::AlternativeBegin()); - m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = alternativeFrameLocation; - - m_parenthesesStack.append(ParenthesesStackEntry(beginTerm, m_currentAlternativeIndex)); - m_currentAlternativeIndex = beginTerm + 1; - } - - unsigned popParenthesesStack() - { - ASSERT(m_parenthesesStack.size()); - int stackEnd = m_parenthesesStack.size() - 1; - unsigned beginTerm = m_parenthesesStack[stackEnd].beginTerm; - m_currentAlternativeIndex = m_parenthesesStack[stackEnd].savedAlternativeIndex; - m_parenthesesStack.shrink(stackEnd); - - ASSERT(beginTerm < m_bodyDisjunction->terms.size()); - ASSERT(m_currentAlternativeIndex < m_bodyDisjunction->terms.size()); - - return beginTerm; - } - -#ifndef NDEBUG - void dumpDisjunction(ByteDisjunction* disjunction) - { - printf("ByteDisjunction(%p):\n\t", disjunction); - for (unsigned i = 0; i < disjunction->terms.size(); ++i) - printf("{ %d } ", disjunction->terms[i].type); - printf("\n"); - } -#endif - - void closeAlternative(int beginTerm) - { - int origBeginTerm = beginTerm; - ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeAlternativeBegin); - int endIndex = m_bodyDisjunction->terms.size(); - - unsigned frameLocation = m_bodyDisjunction->terms[beginTerm].frameLocation; - - if (!m_bodyDisjunction->terms[beginTerm].alternative.next) - m_bodyDisjunction->terms.remove(beginTerm); - else { - while (m_bodyDisjunction->terms[beginTerm].alternative.next) { - beginTerm += m_bodyDisjunction->terms[beginTerm].alternative.next; - ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeAlternativeDisjunction); - m_bodyDisjunction->terms[beginTerm].alternative.end = endIndex - beginTerm; - m_bodyDisjunction->terms[beginTerm].frameLocation = frameLocation; - } - - m_bodyDisjunction->terms[beginTerm].alternative.next = origBeginTerm - beginTerm; - - m_bodyDisjunction->terms.append(ByteTerm::AlternativeEnd()); - m_bodyDisjunction->terms[endIndex].frameLocation = frameLocation; - } - } - - void closeBodyAlternative() - { - int beginTerm = 0; - int origBeginTerm = 0; - ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeBodyAlternativeBegin); - int endIndex = m_bodyDisjunction->terms.size(); - - unsigned frameLocation = m_bodyDisjunction->terms[beginTerm].frameLocation; - - while (m_bodyDisjunction->terms[beginTerm].alternative.next) { - beginTerm += m_bodyDisjunction->terms[beginTerm].alternative.next; - ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeBodyAlternativeDisjunction); - m_bodyDisjunction->terms[beginTerm].alternative.end = endIndex - beginTerm; - m_bodyDisjunction->terms[beginTerm].frameLocation = frameLocation; - } - - m_bodyDisjunction->terms[beginTerm].alternative.next = origBeginTerm - beginTerm; - - m_bodyDisjunction->terms.append(ByteTerm::BodyAlternativeEnd()); - m_bodyDisjunction->terms[endIndex].frameLocation = frameLocation; - } - - void atomParenthesesEnd(bool doInline, unsigned lastSubpatternId, int inputPosition, unsigned frameLocation, unsigned quantityCount, QuantifierType quantityType, unsigned callFrameSize = 0) - { - unsigned beginTerm = popParenthesesStack(); - closeAlternative(beginTerm + 1); - unsigned endTerm = m_bodyDisjunction->terms.size(); - - bool isAssertion = m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeParentheticalAssertionBegin; - bool invertOrCapture = m_bodyDisjunction->terms[beginTerm].invertOrCapture; - unsigned subpatternId = m_bodyDisjunction->terms[beginTerm].atom.subpatternId; - - m_bodyDisjunction->terms.append(ByteTerm(isAssertion ? ByteTerm::TypeParentheticalAssertionEnd : ByteTerm::TypeParenthesesSubpatternOnceEnd, subpatternId, invertOrCapture, inputPosition)); - m_bodyDisjunction->terms[beginTerm].atom.parenthesesWidth = endTerm - beginTerm; - m_bodyDisjunction->terms[endTerm].atom.parenthesesWidth = endTerm - beginTerm; - m_bodyDisjunction->terms[endTerm].frameLocation = frameLocation; - - if (doInline) { - m_bodyDisjunction->terms[beginTerm].atom.quantityCount = quantityCount; - m_bodyDisjunction->terms[beginTerm].atom.quantityType = quantityType; - m_bodyDisjunction->terms[endTerm].atom.quantityCount = quantityCount; - m_bodyDisjunction->terms[endTerm].atom.quantityType = quantityType; - } else { - ByteTerm& parenthesesBegin = m_bodyDisjunction->terms[beginTerm]; - ASSERT(parenthesesBegin.type == ByteTerm::TypeParenthesesSubpatternOnceBegin); - - bool invertOrCapture = parenthesesBegin.invertOrCapture; - unsigned subpatternId = parenthesesBegin.atom.subpatternId; - - unsigned numSubpatterns = lastSubpatternId - subpatternId + 1; - ByteDisjunction* parenthesesDisjunction = new ByteDisjunction(numSubpatterns, callFrameSize); - - parenthesesDisjunction->terms.append(ByteTerm::SubpatternBegin()); - for (unsigned termInParentheses = beginTerm + 1; termInParentheses < endTerm; ++termInParentheses) - parenthesesDisjunction->terms.append(m_bodyDisjunction->terms[termInParentheses]); - parenthesesDisjunction->terms.append(ByteTerm::SubpatternEnd()); - - m_bodyDisjunction->terms.shrink(beginTerm); - - m_allParenthesesInfo.append(parenthesesDisjunction); - m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParenthesesSubpattern, subpatternId, parenthesesDisjunction, invertOrCapture, inputPosition)); - - m_bodyDisjunction->terms[beginTerm].atom.quantityCount = quantityCount; - m_bodyDisjunction->terms[beginTerm].atom.quantityType = quantityType; - m_bodyDisjunction->terms[beginTerm].frameLocation = frameLocation; - } - } - - void regexBegin(unsigned numSubpatterns, unsigned callFrameSize) - { - m_bodyDisjunction = new ByteDisjunction(numSubpatterns, callFrameSize); - m_bodyDisjunction->terms.append(ByteTerm::BodyAlternativeBegin()); - m_bodyDisjunction->terms[0].frameLocation = 0; - m_currentAlternativeIndex = 0; - } - - void regexEnd() - { - closeBodyAlternative(); - } - - void alternativeBodyDisjunction() - { - int newAlternativeIndex = m_bodyDisjunction->terms.size(); - m_bodyDisjunction->terms[m_currentAlternativeIndex].alternative.next = newAlternativeIndex - m_currentAlternativeIndex; - m_bodyDisjunction->terms.append(ByteTerm::BodyAlternativeDisjunction()); - - m_currentAlternativeIndex = newAlternativeIndex; - } - - void alternativeDisjunction() - { - int newAlternativeIndex = m_bodyDisjunction->terms.size(); - m_bodyDisjunction->terms[m_currentAlternativeIndex].alternative.next = newAlternativeIndex - m_currentAlternativeIndex; - m_bodyDisjunction->terms.append(ByteTerm::AlternativeDisjunction()); - - m_currentAlternativeIndex = newAlternativeIndex; - } - - void emitDisjunction(PatternDisjunction* disjunction, unsigned inputCountAlreadyChecked = 0, unsigned parenthesesInputCountAlreadyChecked = 0) - { - for (unsigned alt = 0; alt < disjunction->m_alternatives.size(); ++alt) { - unsigned currentCountAlreadyChecked = inputCountAlreadyChecked; - - if (alt) { - if (disjunction == m_pattern.m_body) - alternativeBodyDisjunction(); - else - alternativeDisjunction(); - } - - PatternAlternative* alternative = disjunction->m_alternatives[alt]; - unsigned minimumSize = alternative->m_minimumSize; - - ASSERT(minimumSize >= parenthesesInputCountAlreadyChecked); - unsigned countToCheck = minimumSize - parenthesesInputCountAlreadyChecked; - if (countToCheck) - checkInput(countToCheck); - currentCountAlreadyChecked += countToCheck; - - for (unsigned i = 0; i < alternative->m_terms.size(); ++i) { - PatternTerm& term = alternative->m_terms[i]; - - switch (term.type) { - case PatternTerm::TypeAssertionBOL: - assertionBOL(term.inputPosition - currentCountAlreadyChecked); - break; - - case PatternTerm::TypeAssertionEOL: - assertionEOL(term.inputPosition - currentCountAlreadyChecked); - break; - - case PatternTerm::TypeAssertionWordBoundary: - assertionWordBoundary(term.invertOrCapture, term.inputPosition - currentCountAlreadyChecked); - break; - - case PatternTerm::TypePatternCharacter: - atomPatternCharacter(term.patternCharacter, term.inputPosition - currentCountAlreadyChecked, term.frameLocation, term.quantityCount, term.quantityType); - break; - - case PatternTerm::TypeCharacterClass: - atomCharacterClass(term.characterClass, term.invertOrCapture, term.inputPosition - currentCountAlreadyChecked, term.frameLocation, term.quantityCount, term.quantityType); - break; - - case PatternTerm::TypeBackReference: - atomBackReference(term.subpatternId, term.inputPosition - currentCountAlreadyChecked, term.frameLocation, term.quantityCount, term.quantityType); - break; - - case PatternTerm::TypeForwardReference: - break; - - case PatternTerm::TypeParenthesesSubpattern: { - unsigned disjunctionAlreadyCheckedCount = 0; - if ((term.quantityCount == 1) && !term.parentheses.isCopy) { - if (term.quantityType == QuantifierFixedCount) { - disjunctionAlreadyCheckedCount = term.parentheses.disjunction->m_minimumSize; - unsigned delegateEndInputOffset = term.inputPosition - currentCountAlreadyChecked; - atomParenthesesSubpatternBegin(term.parentheses.subpatternId, term.invertOrCapture, delegateEndInputOffset - disjunctionAlreadyCheckedCount, term.frameLocation, term.frameLocation); - emitDisjunction(term.parentheses.disjunction, currentCountAlreadyChecked, term.parentheses.disjunction->m_minimumSize); - atomParenthesesEnd(true, term.parentheses.lastSubpatternId, delegateEndInputOffset, term.frameLocation, term.quantityCount, term.quantityType, term.parentheses.disjunction->m_callFrameSize); - } else { - unsigned delegateEndInputOffset = term.inputPosition - currentCountAlreadyChecked; - atomParenthesesSubpatternBegin(term.parentheses.subpatternId, term.invertOrCapture, delegateEndInputOffset - disjunctionAlreadyCheckedCount, term.frameLocation, term.frameLocation + RegexStackSpaceForBackTrackInfoParenthesesOnce); - emitDisjunction(term.parentheses.disjunction, currentCountAlreadyChecked, 0); - atomParenthesesEnd(true, term.parentheses.lastSubpatternId, delegateEndInputOffset, term.frameLocation, term.quantityCount, term.quantityType, term.parentheses.disjunction->m_callFrameSize); - } - } else { - unsigned delegateEndInputOffset = term.inputPosition - currentCountAlreadyChecked; - atomParenthesesSubpatternBegin(term.parentheses.subpatternId, term.invertOrCapture, delegateEndInputOffset - disjunctionAlreadyCheckedCount, term.frameLocation, 0); - emitDisjunction(term.parentheses.disjunction, currentCountAlreadyChecked, 0); - atomParenthesesEnd(false, term.parentheses.lastSubpatternId, delegateEndInputOffset, term.frameLocation, term.quantityCount, term.quantityType, term.parentheses.disjunction->m_callFrameSize); - } - break; - } - - case PatternTerm::TypeParentheticalAssertion: { - unsigned alternativeFrameLocation = term.frameLocation + RegexStackSpaceForBackTrackInfoParentheticalAssertion; - - atomParentheticalAssertionBegin(term.parentheses.subpatternId, term.invertOrCapture, term.frameLocation, alternativeFrameLocation); - emitDisjunction(term.parentheses.disjunction, currentCountAlreadyChecked, 0); - atomParenthesesEnd(true, term.parentheses.lastSubpatternId, 0, term.frameLocation, term.quantityCount, term.quantityType); - break; - } - } - } - } - } - -private: - RegexPattern& m_pattern; - ByteDisjunction* m_bodyDisjunction; - unsigned m_currentAlternativeIndex; - Vector m_parenthesesStack; - Vector m_allParenthesesInfo; -}; - - -BytecodePattern* byteCompileRegex(const UString& patternString, unsigned& numSubpatterns, const char*& error, bool ignoreCase, bool multiline) -{ - RegexPattern pattern(ignoreCase, multiline); - - if ((error = compileRegex(patternString, pattern))) - return 0; - - numSubpatterns = pattern.m_numSubpatterns; - - return ByteCompiler(pattern).compile(); -} - -int interpretRegex(BytecodePattern* regex, const UChar* input, unsigned start, unsigned length, int* output) -{ - return Interpreter(regex, output, input, start, length).interpret(); -} - - -COMPILE_ASSERT(sizeof(Interpreter::BackTrackInfoPatternCharacter) == (RegexStackSpaceForBackTrackInfoPatternCharacter * sizeof(uintptr_t)), CheckRegexStackSpaceForBackTrackInfoPatternCharacter); -COMPILE_ASSERT(sizeof(Interpreter::BackTrackInfoCharacterClass) == (RegexStackSpaceForBackTrackInfoCharacterClass * sizeof(uintptr_t)), CheckRegexStackSpaceForBackTrackInfoCharacterClass); -COMPILE_ASSERT(sizeof(Interpreter::BackTrackInfoBackReference) == (RegexStackSpaceForBackTrackInfoBackReference * sizeof(uintptr_t)), CheckRegexStackSpaceForBackTrackInfoBackReference); -COMPILE_ASSERT(sizeof(Interpreter::BackTrackInfoAlternative) == (RegexStackSpaceForBackTrackInfoAlternative * sizeof(uintptr_t)), CheckRegexStackSpaceForBackTrackInfoAlternative); -COMPILE_ASSERT(sizeof(Interpreter::BackTrackInfoParentheticalAssertion) == (RegexStackSpaceForBackTrackInfoParentheticalAssertion * sizeof(uintptr_t)), CheckRegexStackSpaceForBackTrackInfoParentheticalAssertion); -COMPILE_ASSERT(sizeof(Interpreter::BackTrackInfoParenthesesOnce) == (RegexStackSpaceForBackTrackInfoParenthesesOnce * sizeof(uintptr_t)), CheckRegexStackSpaceForBackTrackInfoParenthesesOnce); -COMPILE_ASSERT(sizeof(Interpreter::BackTrackInfoParentheses) == (RegexStackSpaceForBackTrackInfoParentheses * sizeof(uintptr_t)), CheckRegexStackSpaceForBackTrackInfoParentheses); - - -} } - -#endif