+++ /dev/null
-/*
- * Copyright (C) 2009 Apple Inc. All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
- * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
- * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
- * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
- * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
- * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
- * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
- * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-
-#include "config.h"
-#include "RegexInterpreter.h"
-
-#include "RegexCompiler.h"
-#include "RegexPattern.h"
-
-#ifndef NDEBUG
-#include <stdio.h>
-#endif
-
-#if ENABLE(YARR)
-
-using namespace WTF;
-
-namespace JSC { namespace Yarr {
-
-class Interpreter {
-public:
- struct ParenthesesDisjunctionContext;
-
- struct BackTrackInfoPatternCharacter {
- uintptr_t matchAmount;
- };
- struct BackTrackInfoCharacterClass {
- uintptr_t matchAmount;
- };
- struct BackTrackInfoBackReference {
- uintptr_t begin; // Not really needed for greedy quantifiers.
- uintptr_t matchAmount; // Not really needed for fixed quantifiers.
- };
- struct BackTrackInfoAlternative {
- uintptr_t offset;
- };
- struct BackTrackInfoParentheticalAssertion {
- uintptr_t begin;
- };
- struct BackTrackInfoParenthesesOnce {
- uintptr_t inParentheses;
- };
- struct BackTrackInfoParentheses {
- uintptr_t matchAmount;
- ParenthesesDisjunctionContext* lastContext;
- uintptr_t prevBegin;
- uintptr_t prevEnd;
- };
-
- static inline void appendParenthesesDisjunctionContext(BackTrackInfoParentheses* backTrack, ParenthesesDisjunctionContext* context)
- {
- context->next = backTrack->lastContext;
- backTrack->lastContext = context;
- ++backTrack->matchAmount;
- }
-
- static inline void popParenthesesDisjunctionContext(BackTrackInfoParentheses* backTrack)
- {
- ASSERT(backTrack->matchAmount);
- ASSERT(backTrack->lastContext);
- backTrack->lastContext = backTrack->lastContext->next;
- --backTrack->matchAmount;
- }
-
- struct DisjunctionContext
- {
- DisjunctionContext()
- : term(0)
- {
- }
-
- void* operator new(size_t, void* where)
- {
- return where;
- }
-
- int term;
- unsigned matchBegin;
- unsigned matchEnd;
- uintptr_t frame[1];
- };
-
- DisjunctionContext* allocDisjunctionContext(ByteDisjunction* disjunction)
- {
- return new(malloc(sizeof(DisjunctionContext) + (disjunction->m_frameSize - 1) * sizeof(uintptr_t))) DisjunctionContext();
- }
-
- void freeDisjunctionContext(DisjunctionContext* context)
- {
- free(context);
- }
-
- struct ParenthesesDisjunctionContext
- {
- ParenthesesDisjunctionContext(int* output, ByteTerm& term)
- : next(0)
- {
- unsigned firstSubpatternId = term.atom.subpatternId;
- unsigned numNestedSubpatterns = term.atom.parenthesesDisjunction->m_numSubpatterns;
-
- for (unsigned i = 0; i < (numNestedSubpatterns << 1); ++i) {
- subpatternBackup[i] = output[(firstSubpatternId << 1) + i];
- output[(firstSubpatternId << 1) + i] = -1;
- }
-
- new(getDisjunctionContext(term)) DisjunctionContext();
- }
-
- void* operator new(size_t, void* where)
- {
- return where;
- }
-
- void restoreOutput(int* output, unsigned firstSubpatternId, unsigned numNestedSubpatterns)
- {
- for (unsigned i = 0; i < (numNestedSubpatterns << 1); ++i)
- output[(firstSubpatternId << 1) + i] = subpatternBackup[i];
- }
-
- DisjunctionContext* getDisjunctionContext(ByteTerm& term)
- {
- return reinterpret_cast<DisjunctionContext*>(&(subpatternBackup[term.atom.parenthesesDisjunction->m_numSubpatterns << 1]));
- }
-
- ParenthesesDisjunctionContext* next;
- int subpatternBackup[1];
- };
-
- ParenthesesDisjunctionContext* allocParenthesesDisjunctionContext(ByteDisjunction* disjunction, int* output, ByteTerm& term)
- {
- return new(malloc(sizeof(ParenthesesDisjunctionContext) + (((term.atom.parenthesesDisjunction->m_numSubpatterns << 1) - 1) * sizeof(int)) + sizeof(DisjunctionContext) + (disjunction->m_frameSize - 1) * sizeof(uintptr_t))) ParenthesesDisjunctionContext(output, term);
- }
-
- void freeParenthesesDisjunctionContext(ParenthesesDisjunctionContext* context)
- {
- free(context);
- }
-
- class InputStream {
- public:
- InputStream(const UChar* input, unsigned start, unsigned length)
- : input(input)
- , pos(start)
- , length(length)
- {
- }
-
- void next()
- {
- ++pos;
- }
-
- void rewind(unsigned amount)
- {
- ASSERT(pos >= amount);
- pos -= amount;
- }
-
- int read()
- {
- ASSERT(pos < length);
- if (pos < length)
- return input[pos];
- return -1;
- }
-
- int readChecked(int position)
- {
- ASSERT(position < 0);
- ASSERT((unsigned)-position <= pos);
- unsigned p = pos + position;
- ASSERT(p < length);
- return input[p];
- }
-
- int reread(unsigned from)
- {
- ASSERT(from < length);
- return input[from];
- }
-
- int prev()
- {
- ASSERT(!(pos > length));
- if (pos && length)
- return input[pos - 1];
- return -1;
- }
-
- unsigned getPos()
- {
- return pos;
- }
-
- void setPos(unsigned p)
- {
- pos = p;
- }
-
- bool atStart()
- {
- return pos == 0;
- }
-
- bool atEnd()
- {
- return pos == length;
- }
-
- bool checkInput(int count)
- {
- if ((pos + count) <= length) {
- pos += count;
- return true;
- } else
- return false;
- }
-
- void uncheckInput(int count)
- {
- pos -= count;
- }
-
- bool atStart(int position)
- {
- return (pos + position) == 0;
- }
-
- bool atEnd(int position)
- {
- return (pos + position) == length;
- }
-
- private:
- const UChar* input;
- unsigned pos;
- unsigned length;
- };
-
- bool testCharacterClass(CharacterClass* characterClass, int ch)
- {
- if (ch & 0xFF80) {
- for (unsigned i = 0; i < characterClass->m_matchesUnicode.size(); ++i)
- if (ch == characterClass->m_matchesUnicode[i])
- return true;
- for (unsigned i = 0; i < characterClass->m_rangesUnicode.size(); ++i)
- if ((ch >= characterClass->m_rangesUnicode[i].begin) && (ch <= characterClass->m_rangesUnicode[i].end))
- return true;
- } else {
- for (unsigned i = 0; i < characterClass->m_matches.size(); ++i)
- if (ch == characterClass->m_matches[i])
- return true;
- for (unsigned i = 0; i < characterClass->m_ranges.size(); ++i)
- if ((ch >= characterClass->m_ranges[i].begin) && (ch <= characterClass->m_ranges[i].end))
- return true;
- }
-
- return false;
- }
-
- bool checkCharacter(int testChar, int inputPosition)
- {
- return testChar == input.readChecked(inputPosition);
- }
-
- bool checkCasedCharacter(int loChar, int hiChar, int inputPosition)
- {
- int ch = input.readChecked(inputPosition);
- return (loChar == ch) || (hiChar == ch);
- }
-
- bool checkCharacterClass(CharacterClass* characterClass, bool invert, int inputPosition)
- {
- bool match = testCharacterClass(characterClass, input.readChecked(inputPosition));
- return invert ? !match : match;
- }
-
- bool tryConsumeBackReference(int matchBegin, int matchEnd, int inputOffset)
- {
- int matchSize = matchEnd - matchBegin;
-
- if (!input.checkInput(matchSize))
- return false;
-
- for (int i = 0; i < matchSize; ++i) {
- if (!checkCharacter(input.reread(matchBegin + i), inputOffset - matchSize + i)) {
- input.uncheckInput(matchSize);
- return false;
- }
- }
-
- return true;
- }
-
- bool matchAssertionBOL(ByteTerm& term)
- {
- return (input.atStart(term.inputPosition)) || (pattern->m_multiline && testCharacterClass(pattern->newlineCharacterClass, input.readChecked(term.inputPosition - 1)));
- }
-
- bool matchAssertionEOL(ByteTerm& term)
- {
- if (term.inputPosition)
- return (input.atEnd(term.inputPosition)) || (pattern->m_multiline && testCharacterClass(pattern->newlineCharacterClass, input.readChecked(term.inputPosition)));
- else
- return (input.atEnd()) || (pattern->m_multiline && testCharacterClass(pattern->newlineCharacterClass, input.read()));
- }
-
- bool matchAssertionWordBoundary(ByteTerm& term)
- {
- bool prevIsWordchar = !input.atStart(term.inputPosition) && testCharacterClass(pattern->wordcharCharacterClass, input.readChecked(term.inputPosition - 1));
- bool readIsWordchar;
- if (term.inputPosition)
- readIsWordchar = !input.atEnd(term.inputPosition) && testCharacterClass(pattern->wordcharCharacterClass, input.readChecked(term.inputPosition));
- else
- readIsWordchar = !input.atEnd() && testCharacterClass(pattern->wordcharCharacterClass, input.read());
-
- bool wordBoundary = prevIsWordchar != readIsWordchar;
- return term.invert() ? !wordBoundary : wordBoundary;
- }
-
- bool backtrackPatternCharacter(ByteTerm& term, DisjunctionContext* context)
- {
- BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + term.frameLocation);
-
- switch (term.atom.quantityType) {
- case QuantifierFixedCount:
- break;
-
- case QuantifierGreedy:
- if (backTrack->matchAmount) {
- --backTrack->matchAmount;
- input.uncheckInput(1);
- return true;
- }
- break;
-
- case QuantifierNonGreedy:
- if ((backTrack->matchAmount < term.atom.quantityCount) && input.checkInput(1)) {
- ++backTrack->matchAmount;
- if (checkCharacter(term.atom.patternCharacter, term.inputPosition - 1))
- return true;
- }
- input.uncheckInput(backTrack->matchAmount);
- break;
- }
-
- return false;
- }
-
- bool backtrackPatternCasedCharacter(ByteTerm& term, DisjunctionContext* context)
- {
- BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + term.frameLocation);
-
- switch (term.atom.quantityType) {
- case QuantifierFixedCount:
- break;
-
- case QuantifierGreedy:
- if (backTrack->matchAmount) {
- --backTrack->matchAmount;
- input.uncheckInput(1);
- return true;
- }
- break;
-
- case QuantifierNonGreedy:
- if ((backTrack->matchAmount < term.atom.quantityCount) && input.checkInput(1)) {
- ++backTrack->matchAmount;
- if (checkCasedCharacter(term.atom.casedCharacter.lo, term.atom.casedCharacter.hi, term.inputPosition - 1))
- return true;
- }
- input.uncheckInput(backTrack->matchAmount);
- break;
- }
-
- return false;
- }
-
- bool matchCharacterClass(ByteTerm& term, DisjunctionContext* context)
- {
- ASSERT(term.type == ByteTerm::TypeCharacterClass);
- BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + term.frameLocation);
-
- switch (term.atom.quantityType) {
- case QuantifierFixedCount: {
- for (unsigned matchAmount = 0; matchAmount < term.atom.quantityCount; ++matchAmount) {
- if (!checkCharacterClass(term.atom.characterClass, term.invert(), term.inputPosition + matchAmount))
- return false;
- }
- return true;
- }
-
- case QuantifierGreedy: {
- unsigned matchAmount = 0;
- while ((matchAmount < term.atom.quantityCount) && input.checkInput(1)) {
- if (!checkCharacterClass(term.atom.characterClass, term.invert(), term.inputPosition - 1)) {
- input.uncheckInput(1);
- break;
- }
- ++matchAmount;
- }
- backTrack->matchAmount = matchAmount;
-
- return true;
- }
-
- case QuantifierNonGreedy:
- backTrack->matchAmount = 0;
- return true;
- }
-
- ASSERT_NOT_REACHED();
- return false;
- }
-
- bool backtrackCharacterClass(ByteTerm& term, DisjunctionContext* context)
- {
- ASSERT(term.type == ByteTerm::TypeCharacterClass);
- BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + term.frameLocation);
-
- switch (term.atom.quantityType) {
- case QuantifierFixedCount:
- break;
-
- case QuantifierGreedy:
- if (backTrack->matchAmount) {
- --backTrack->matchAmount;
- input.uncheckInput(1);
- return true;
- }
- break;
-
- case QuantifierNonGreedy:
- if ((backTrack->matchAmount < term.atom.quantityCount) && input.checkInput(1)) {
- ++backTrack->matchAmount;
- if (checkCharacterClass(term.atom.characterClass, term.invert(), term.inputPosition - 1))
- return true;
- }
- input.uncheckInput(backTrack->matchAmount);
- break;
- }
-
- return false;
- }
-
- bool matchBackReference(ByteTerm& term, DisjunctionContext* context)
- {
- ASSERT(term.type == ByteTerm::TypeBackReference);
- BackTrackInfoBackReference* backTrack = reinterpret_cast<BackTrackInfoBackReference*>(context->frame + term.frameLocation);
-
- int matchBegin = output[(term.atom.subpatternId << 1)];
- int matchEnd = output[(term.atom.subpatternId << 1) + 1];
- ASSERT((matchBegin == -1) == (matchEnd == -1));
- ASSERT(matchBegin <= matchEnd);
-
- if (matchBegin == matchEnd)
- return true;
-
- switch (term.atom.quantityType) {
- case QuantifierFixedCount: {
- backTrack->begin = input.getPos();
- for (unsigned matchAmount = 0; matchAmount < term.atom.quantityCount; ++matchAmount) {
- if (!tryConsumeBackReference(matchBegin, matchEnd, term.inputPosition)) {
- input.setPos(backTrack->begin);
- return false;
- }
- }
- return true;
- }
-
- case QuantifierGreedy: {
- unsigned matchAmount = 0;
- while ((matchAmount < term.atom.quantityCount) && tryConsumeBackReference(matchBegin, matchEnd, term.inputPosition))
- ++matchAmount;
- backTrack->matchAmount = matchAmount;
- return true;
- }
-
- case QuantifierNonGreedy:
- backTrack->begin = input.getPos();
- backTrack->matchAmount = 0;
- return true;
- }
-
- ASSERT_NOT_REACHED();
- return false;
- }
-
- bool backtrackBackReference(ByteTerm& term, DisjunctionContext* context)
- {
- ASSERT(term.type == ByteTerm::TypeBackReference);
- BackTrackInfoBackReference* backTrack = reinterpret_cast<BackTrackInfoBackReference*>(context->frame + term.frameLocation);
-
- int matchBegin = output[(term.atom.subpatternId << 1)];
- int matchEnd = output[(term.atom.subpatternId << 1) + 1];
- ASSERT((matchBegin == -1) == (matchEnd == -1));
- ASSERT(matchBegin <= matchEnd);
-
- if (matchBegin == matchEnd)
- return false;
-
- switch (term.atom.quantityType) {
- case QuantifierFixedCount:
- // for quantityCount == 1, could rewind.
- input.setPos(backTrack->begin);
- break;
-
- case QuantifierGreedy:
- if (backTrack->matchAmount) {
- --backTrack->matchAmount;
- input.rewind(matchEnd - matchBegin);
- return true;
- }
- break;
-
- case QuantifierNonGreedy:
- if ((backTrack->matchAmount < term.atom.quantityCount) && tryConsumeBackReference(matchBegin, matchEnd, term.inputPosition)) {
- ++backTrack->matchAmount;
- return true;
- } else
- input.setPos(backTrack->begin);
- break;
- }
-
- return false;
- }
-
- void recordParenthesesMatch(ByteTerm& term, ParenthesesDisjunctionContext* context)
- {
- if (term.capture()) {
- unsigned subpatternId = term.atom.subpatternId;
- output[(subpatternId << 1)] = context->getDisjunctionContext(term)->matchBegin + term.inputPosition;
- output[(subpatternId << 1) + 1] = context->getDisjunctionContext(term)->matchEnd + term.inputPosition;
- }
- }
- void resetMatches(ByteTerm& term, ParenthesesDisjunctionContext* context)
- {
- unsigned firstSubpatternId = term.atom.subpatternId;
- unsigned count = term.atom.parenthesesDisjunction->m_numSubpatterns;
- context->restoreOutput(output, firstSubpatternId, count);
- }
- void resetAssertionMatches(ByteTerm& term)
- {
- unsigned firstSubpatternId = term.atom.subpatternId;
- unsigned count = term.atom.parenthesesDisjunction->m_numSubpatterns;
- for (unsigned i = 0; i < (count << 1); ++i)
- output[(firstSubpatternId << 1) + i] = -1;
- }
- bool parenthesesDoBacktrack(ByteTerm& term, BackTrackInfoParentheses* backTrack)
- {
- while (backTrack->matchAmount) {
- ParenthesesDisjunctionContext* context = backTrack->lastContext;
-
- if (matchDisjunction(term.atom.parenthesesDisjunction, context->getDisjunctionContext(term), true))
- return true;
-
- resetMatches(term, context);
- popParenthesesDisjunctionContext(backTrack);
- freeParenthesesDisjunctionContext(context);
- }
-
- return false;
- }
-
- bool matchParenthesesOnceBegin(ByteTerm& term, DisjunctionContext* context)
- {
- ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternOnceBegin);
- ASSERT(term.atom.quantityCount == 1);
-
- BackTrackInfoParenthesesOnce* backTrack = reinterpret_cast<BackTrackInfoParenthesesOnce*>(context->frame + term.frameLocation);
-
- switch (term.atom.quantityType) {
- case QuantifierGreedy: {
- // set this speculatively; if we get to the parens end this will be true.
- backTrack->inParentheses = 1;
- break;
- }
- case QuantifierNonGreedy: {
- backTrack->inParentheses = 0;
- context->term += term.atom.parenthesesWidth;
- return true;
- }
- case QuantifierFixedCount:
- break;
- }
-
- if (term.capture()) {
- unsigned subpatternId = term.atom.subpatternId;
- output[(subpatternId << 1)] = input.getPos() + term.inputPosition;
- }
-
- return true;
- }
-
- bool matchParenthesesOnceEnd(ByteTerm& term, DisjunctionContext*)
- {
- ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternOnceEnd);
- ASSERT(term.atom.quantityCount == 1);
-
- if (term.capture()) {
- unsigned subpatternId = term.atom.subpatternId;
- output[(subpatternId << 1) + 1] = input.getPos() + term.inputPosition;
- }
- return true;
- }
-
- bool backtrackParenthesesOnceBegin(ByteTerm& term, DisjunctionContext* context)
- {
- ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternOnceBegin);
- ASSERT(term.atom.quantityCount == 1);
-
- BackTrackInfoParenthesesOnce* backTrack = reinterpret_cast<BackTrackInfoParenthesesOnce*>(context->frame + term.frameLocation);
-
- if (term.capture()) {
- unsigned subpatternId = term.atom.subpatternId;
- output[(subpatternId << 1)] = -1;
- output[(subpatternId << 1) + 1] = -1;
- }
-
- switch (term.atom.quantityType) {
- case QuantifierGreedy:
- // if we backtrack to this point, there is another chance - try matching nothing.
- ASSERT(backTrack->inParentheses);
- backTrack->inParentheses = 0;
- context->term += term.atom.parenthesesWidth;
- return true;
- case QuantifierNonGreedy:
- ASSERT(backTrack->inParentheses);
- case QuantifierFixedCount:
- break;
- }
-
- return false;
- }
-
- bool backtrackParenthesesOnceEnd(ByteTerm& term, DisjunctionContext* context)
- {
- ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternOnceEnd);
- ASSERT(term.atom.quantityCount == 1);
-
- BackTrackInfoParenthesesOnce* backTrack = reinterpret_cast<BackTrackInfoParenthesesOnce*>(context->frame + term.frameLocation);
-
- switch (term.atom.quantityType) {
- case QuantifierGreedy:
- if (!backTrack->inParentheses) {
- context->term -= term.atom.parenthesesWidth;
- return false;
- }
- case QuantifierNonGreedy:
- if (!backTrack->inParentheses) {
- // now try to match the parens; set this speculatively.
- backTrack->inParentheses = 1;
- if (term.capture()) {
- unsigned subpatternId = term.atom.subpatternId;
- output[(subpatternId << 1) + 1] = input.getPos() + term.inputPosition;
- }
- context->term -= term.atom.parenthesesWidth;
- return true;
- }
- case QuantifierFixedCount:
- break;
- }
-
- return false;
- }
-
- bool matchParentheticalAssertionBegin(ByteTerm& term, DisjunctionContext* context)
- {
- ASSERT(term.type == ByteTerm::TypeParentheticalAssertionBegin);
- ASSERT(term.atom.quantityCount == 1);
-
- BackTrackInfoParentheticalAssertion* backTrack = reinterpret_cast<BackTrackInfoParentheticalAssertion*>(context->frame + term.frameLocation);
-
- backTrack->begin = input.getPos();
- return true;
- }
-
- bool matchParentheticalAssertionEnd(ByteTerm& term, DisjunctionContext* context)
- {
- ASSERT(term.type == ByteTerm::TypeParentheticalAssertionEnd);
- ASSERT(term.atom.quantityCount == 1);
-
- BackTrackInfoParentheticalAssertion* backTrack = reinterpret_cast<BackTrackInfoParentheticalAssertion*>(context->frame + term.frameLocation);
-
- input.setPos(backTrack->begin);
-
- // We've reached the end of the parens; if they are inverted, this is failure.
- if (term.invert()) {
- context->term -= term.atom.parenthesesWidth;
- return false;
- }
-
- return true;
- }
-
- bool backtrackParentheticalAssertionBegin(ByteTerm& term, DisjunctionContext* context)
- {
- ASSERT(term.type == ByteTerm::TypeParentheticalAssertionBegin);
- ASSERT(term.atom.quantityCount == 1);
-
- // We've failed to match parens; if they are inverted, this is win!
- if (term.invert()) {
- context->term += term.atom.parenthesesWidth;
- return true;
- }
-
- return false;
- }
-
- bool backtrackParentheticalAssertionEnd(ByteTerm& term, DisjunctionContext* context)
- {
- ASSERT(term.type == ByteTerm::TypeParentheticalAssertionEnd);
- ASSERT(term.atom.quantityCount == 1);
-
- BackTrackInfoParentheticalAssertion* backTrack = reinterpret_cast<BackTrackInfoParentheticalAssertion*>(context->frame + term.frameLocation);
-
- input.setPos(backTrack->begin);
-
- context->term -= term.atom.parenthesesWidth;
- return false;
- }
-
- bool matchParentheses(ByteTerm& term, DisjunctionContext* context)
- {
- ASSERT(term.type == ByteTerm::TypeParenthesesSubpattern);
-
- BackTrackInfoParentheses* backTrack = reinterpret_cast<BackTrackInfoParentheses*>(context->frame + term.frameLocation);
-
- unsigned subpatternId = term.atom.subpatternId;
- ByteDisjunction* disjunctionBody = term.atom.parenthesesDisjunction;
-
- backTrack->prevBegin = output[(subpatternId << 1)];
- backTrack->prevEnd = output[(subpatternId << 1) + 1];
-
- backTrack->matchAmount = 0;
- backTrack->lastContext = 0;
-
- switch (term.atom.quantityType) {
- case QuantifierFixedCount: {
- // While we haven't yet reached our fixed limit,
- while (backTrack->matchAmount < term.atom.quantityCount) {
- // Try to do a match, and it it succeeds, add it to the list.
- ParenthesesDisjunctionContext* context = allocParenthesesDisjunctionContext(disjunctionBody, output, term);
- if (matchDisjunction(disjunctionBody, context->getDisjunctionContext(term)))
- appendParenthesesDisjunctionContext(backTrack, context);
- else {
- // The match failed; try to find an alternate point to carry on from.
- resetMatches(term, context);
- freeParenthesesDisjunctionContext(context);
- if (!parenthesesDoBacktrack(term, backTrack))
- return false;
- }
- }
-
- ASSERT(backTrack->matchAmount == term.atom.quantityCount);
- ParenthesesDisjunctionContext* context = backTrack->lastContext;
- recordParenthesesMatch(term, context);
- return true;
- }
-
- case QuantifierGreedy: {
- while (backTrack->matchAmount < term.atom.quantityCount) {
- ParenthesesDisjunctionContext* context = allocParenthesesDisjunctionContext(disjunctionBody, output, term);
- if (matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term)))
- appendParenthesesDisjunctionContext(backTrack, context);
- else {
- resetMatches(term, context);
- freeParenthesesDisjunctionContext(context);
- break;
- }
- }
-
- if (backTrack->matchAmount) {
- ParenthesesDisjunctionContext* context = backTrack->lastContext;
- recordParenthesesMatch(term, context);
- }
- return true;
- }
-
- case QuantifierNonGreedy:
- return true;
- }
-
- ASSERT_NOT_REACHED();
- return false;
- }
-
- // Rules for backtracking differ depending on whether this is greedy or non-greedy.
- //
- // Greedy matches never should try just adding more - you should already have done
- // the 'more' cases. Always backtrack, at least a leetle bit. However cases where
- // you backtrack an item off the list needs checking, since we'll never have matched
- // the one less case. Tracking forwards, still add as much as possible.
- //
- // Non-greedy, we've already done the one less case, so don't match on popping.
- // We haven't done the one more case, so always try to add that.
- //
- bool backtrackParentheses(ByteTerm& term, DisjunctionContext* context)
- {
- ASSERT(term.type == ByteTerm::TypeParenthesesSubpattern);
-
- BackTrackInfoParentheses* backTrack = reinterpret_cast<BackTrackInfoParentheses*>(context->frame + term.frameLocation);
-
- if (term.capture()) {
- unsigned subpatternId = term.atom.subpatternId;
- output[(subpatternId << 1)] = backTrack->prevBegin;
- output[(subpatternId << 1) + 1] = backTrack->prevEnd;
- }
-
- ByteDisjunction* disjunctionBody = term.atom.parenthesesDisjunction;
-
- switch (term.atom.quantityType) {
- case QuantifierFixedCount: {
- ASSERT(backTrack->matchAmount == term.atom.quantityCount);
-
- ParenthesesDisjunctionContext* context = 0;
-
- if (!parenthesesDoBacktrack(term, backTrack))
- return false;
-
- // While we haven't yet reached our fixed limit,
- while (backTrack->matchAmount < term.atom.quantityCount) {
- // Try to do a match, and it it succeeds, add it to the list.
- context = allocParenthesesDisjunctionContext(disjunctionBody, output, term);
- if (matchDisjunction(disjunctionBody, context->getDisjunctionContext(term)))
- appendParenthesesDisjunctionContext(backTrack, context);
- else {
- // The match failed; try to find an alternate point to carry on from.
- resetMatches(term, context);
- freeParenthesesDisjunctionContext(context);
- if (!parenthesesDoBacktrack(term, backTrack))
- return false;
- }
- }
-
- ASSERT(backTrack->matchAmount == term.atom.quantityCount);
- context = backTrack->lastContext;
- recordParenthesesMatch(term, context);
- return true;
- }
-
- case QuantifierGreedy: {
- if (!backTrack->matchAmount)
- return false;
-
- ParenthesesDisjunctionContext* context = backTrack->lastContext;
- if (matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term), true)) {
- while (backTrack->matchAmount < term.atom.quantityCount) {
- ParenthesesDisjunctionContext* context = allocParenthesesDisjunctionContext(disjunctionBody, output, term);
- if (matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term)))
- appendParenthesesDisjunctionContext(backTrack, context);
- else {
- resetMatches(term, context);
- freeParenthesesDisjunctionContext(context);
- break;
- }
- }
- } else {
- resetMatches(term, context);
- popParenthesesDisjunctionContext(backTrack);
- freeParenthesesDisjunctionContext(context);
- }
-
- if (backTrack->matchAmount) {
- ParenthesesDisjunctionContext* context = backTrack->lastContext;
- recordParenthesesMatch(term, context);
- }
- return true;
- }
-
- case QuantifierNonGreedy: {
- // If we've not reached the limit, try to add one more match.
- if (backTrack->matchAmount < term.atom.quantityCount) {
- ParenthesesDisjunctionContext* context = allocParenthesesDisjunctionContext(disjunctionBody, output, term);
- if (matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term))) {
- appendParenthesesDisjunctionContext(backTrack, context);
- recordParenthesesMatch(term, context);
- return true;
- } else {
- resetMatches(term, context);
- freeParenthesesDisjunctionContext(context);
- }
- }
-
- // Nope - okay backtrack looking for an alternative.
- while (backTrack->matchAmount) {
- ParenthesesDisjunctionContext* context = backTrack->lastContext;
- if (matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term), true)) {
- // successful backtrack! we're back in the game!
- if (backTrack->matchAmount) {
- context = backTrack->lastContext;
- recordParenthesesMatch(term, context);
- }
- return true;
- }
-
- // pop a match off the stack
- resetMatches(term, context);
- popParenthesesDisjunctionContext(backTrack);
- freeParenthesesDisjunctionContext(context);
- }
-
- return false;
- }
- }
-
- ASSERT_NOT_REACHED();
- return false;
- }
-
-#define MATCH_NEXT() { ++context->term; goto matchAgain; }
-#define BACKTRACK() { --context->term; goto backtrack; }
-#define currentTerm() (disjunction->terms[context->term])
- bool matchDisjunction(ByteDisjunction* disjunction, DisjunctionContext* context, bool btrack = false)
- {
- if (btrack)
- BACKTRACK();
-
- context->matchBegin = input.getPos();
- context->term = 0;
-
- matchAgain:
- ASSERT(context->term < static_cast<int>(disjunction->terms.size()));
-
- switch (currentTerm().type) {
- case ByteTerm::TypeSubpatternBegin:
- MATCH_NEXT();
- case ByteTerm::TypeSubpatternEnd:
- context->matchEnd = input.getPos();
- return true;
-
- case ByteTerm::TypeBodyAlternativeBegin:
- MATCH_NEXT();
- case ByteTerm::TypeBodyAlternativeDisjunction:
- case ByteTerm::TypeBodyAlternativeEnd:
- context->matchEnd = input.getPos();
- return true;
-
- case ByteTerm::TypeAlternativeBegin:
- MATCH_NEXT();
- case ByteTerm::TypeAlternativeDisjunction:
- case ByteTerm::TypeAlternativeEnd: {
- int offset = currentTerm().alternative.end;
- BackTrackInfoAlternative* backTrack = reinterpret_cast<BackTrackInfoAlternative*>(context->frame + currentTerm().frameLocation);
- backTrack->offset = offset;
- context->term += offset;
- MATCH_NEXT();
- }
-
- case ByteTerm::TypeAssertionBOL:
- if (matchAssertionBOL(currentTerm()))
- MATCH_NEXT();
- BACKTRACK();
- case ByteTerm::TypeAssertionEOL:
- if (matchAssertionEOL(currentTerm()))
- MATCH_NEXT();
- BACKTRACK();
- case ByteTerm::TypeAssertionWordBoundary:
- if (matchAssertionWordBoundary(currentTerm()))
- MATCH_NEXT();
- BACKTRACK();
-
- case ByteTerm::TypePatternCharacterOnce:
- case ByteTerm::TypePatternCharacterFixed: {
- for (unsigned matchAmount = 0; matchAmount < currentTerm().atom.quantityCount; ++matchAmount) {
- if (!checkCharacter(currentTerm().atom.patternCharacter, currentTerm().inputPosition + matchAmount))
- BACKTRACK();
- }
- MATCH_NEXT();
- }
- case ByteTerm::TypePatternCharacterGreedy: {
- BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + currentTerm().frameLocation);
- unsigned matchAmount = 0;
- while ((matchAmount < currentTerm().atom.quantityCount) && input.checkInput(1)) {
- if (!checkCharacter(currentTerm().atom.patternCharacter, currentTerm().inputPosition - 1)) {
- input.uncheckInput(1);
- break;
- }
- ++matchAmount;
- }
- backTrack->matchAmount = matchAmount;
-
- MATCH_NEXT();
- }
- case ByteTerm::TypePatternCharacterNonGreedy: {
- BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + currentTerm().frameLocation);
- backTrack->matchAmount = 0;
- MATCH_NEXT();
- }
-
- case ByteTerm::TypePatternCasedCharacterOnce:
- case ByteTerm::TypePatternCasedCharacterFixed: {
- for (unsigned matchAmount = 0; matchAmount < currentTerm().atom.quantityCount; ++matchAmount) {
- if (!checkCasedCharacter(currentTerm().atom.casedCharacter.lo, currentTerm().atom.casedCharacter.hi, currentTerm().inputPosition + matchAmount))
- BACKTRACK();
- }
- MATCH_NEXT();
- }
- case ByteTerm::TypePatternCasedCharacterGreedy: {
- BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + currentTerm().frameLocation);
- unsigned matchAmount = 0;
- while ((matchAmount < currentTerm().atom.quantityCount) && input.checkInput(1)) {
- if (!checkCasedCharacter(currentTerm().atom.casedCharacter.lo, currentTerm().atom.casedCharacter.hi, currentTerm().inputPosition - 1)) {
- input.uncheckInput(1);
- break;
- }
- ++matchAmount;
- }
- backTrack->matchAmount = matchAmount;
-
- MATCH_NEXT();
- }
- case ByteTerm::TypePatternCasedCharacterNonGreedy: {
- BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + currentTerm().frameLocation);
- backTrack->matchAmount = 0;
- MATCH_NEXT();
- }
-
- case ByteTerm::TypeCharacterClass:
- if (matchCharacterClass(currentTerm(), context))
- MATCH_NEXT();
- BACKTRACK();
- case ByteTerm::TypeBackReference:
- if (matchBackReference(currentTerm(), context))
- MATCH_NEXT();
- BACKTRACK();
- case ByteTerm::TypeParenthesesSubpattern:
- if (matchParentheses(currentTerm(), context))
- MATCH_NEXT();
- BACKTRACK();
- case ByteTerm::TypeParenthesesSubpatternOnceBegin:
- if (matchParenthesesOnceBegin(currentTerm(), context))
- MATCH_NEXT();
- BACKTRACK();
- case ByteTerm::TypeParenthesesSubpatternOnceEnd:
- if (matchParenthesesOnceEnd(currentTerm(), context))
- MATCH_NEXT();
- BACKTRACK();
- case ByteTerm::TypeParentheticalAssertionBegin:
- if (matchParentheticalAssertionBegin(currentTerm(), context))
- MATCH_NEXT();
- BACKTRACK();
- case ByteTerm::TypeParentheticalAssertionEnd:
- if (matchParentheticalAssertionEnd(currentTerm(), context))
- MATCH_NEXT();
- BACKTRACK();
-
- case ByteTerm::TypeCheckInput:
- if (input.checkInput(currentTerm().checkInputCount))
- MATCH_NEXT();
- BACKTRACK();
- }
-
- // We should never fall-through to here.
- ASSERT_NOT_REACHED();
-
- backtrack:
- ASSERT(context->term < static_cast<int>(disjunction->terms.size()));
-
- switch (currentTerm().type) {
- case ByteTerm::TypeSubpatternBegin:
- return false;
- case ByteTerm::TypeSubpatternEnd:
- ASSERT_NOT_REACHED();
-
- case ByteTerm::TypeBodyAlternativeBegin:
- case ByteTerm::TypeBodyAlternativeDisjunction: {
- int offset = currentTerm().alternative.next;
- context->term += offset;
- if (offset > 0)
- MATCH_NEXT();
-
- if (input.atEnd())
- return false;
-
- input.next();
- context->matchBegin = input.getPos();
- MATCH_NEXT();
- }
- case ByteTerm::TypeBodyAlternativeEnd:
- ASSERT_NOT_REACHED();
-
- case ByteTerm::TypeAlternativeBegin:
- case ByteTerm::TypeAlternativeDisjunction: {
- int offset = currentTerm().alternative.next;
- context->term += offset;
- if (offset > 0)
- MATCH_NEXT();
- BACKTRACK();
- }
- case ByteTerm::TypeAlternativeEnd: {
- // We should never backtrack back into an alternative of the main body of the regex.
- BackTrackInfoAlternative* backTrack = reinterpret_cast<BackTrackInfoAlternative*>(context->frame + currentTerm().frameLocation);
- unsigned offset = backTrack->offset;
- context->term -= offset;
- BACKTRACK();
- }
-
- case ByteTerm::TypeAssertionBOL:
- case ByteTerm::TypeAssertionEOL:
- case ByteTerm::TypeAssertionWordBoundary:
- BACKTRACK();
-
- case ByteTerm::TypePatternCharacterOnce:
- case ByteTerm::TypePatternCharacterFixed:
- case ByteTerm::TypePatternCharacterGreedy:
- case ByteTerm::TypePatternCharacterNonGreedy:
- if (backtrackPatternCharacter(currentTerm(), context))
- MATCH_NEXT();
- BACKTRACK();
- case ByteTerm::TypePatternCasedCharacterOnce:
- case ByteTerm::TypePatternCasedCharacterFixed:
- case ByteTerm::TypePatternCasedCharacterGreedy:
- case ByteTerm::TypePatternCasedCharacterNonGreedy:
- if (backtrackPatternCasedCharacter(currentTerm(), context))
- MATCH_NEXT();
- BACKTRACK();
- case ByteTerm::TypeCharacterClass:
- if (backtrackCharacterClass(currentTerm(), context))
- MATCH_NEXT();
- BACKTRACK();
- case ByteTerm::TypeBackReference:
- if (backtrackBackReference(currentTerm(), context))
- MATCH_NEXT();
- BACKTRACK();
- case ByteTerm::TypeParenthesesSubpattern:
- if (backtrackParentheses(currentTerm(), context))
- MATCH_NEXT();
- BACKTRACK();
- case ByteTerm::TypeParenthesesSubpatternOnceBegin:
- if (backtrackParenthesesOnceBegin(currentTerm(), context))
- MATCH_NEXT();
- BACKTRACK();
- case ByteTerm::TypeParenthesesSubpatternOnceEnd:
- if (backtrackParenthesesOnceEnd(currentTerm(), context))
- MATCH_NEXT();
- BACKTRACK();
- case ByteTerm::TypeParentheticalAssertionBegin:
- if (backtrackParentheticalAssertionBegin(currentTerm(), context))
- MATCH_NEXT();
- BACKTRACK();
- case ByteTerm::TypeParentheticalAssertionEnd:
- if (backtrackParentheticalAssertionEnd(currentTerm(), context))
- MATCH_NEXT();
- BACKTRACK();
-
- case ByteTerm::TypeCheckInput:
- input.uncheckInput(currentTerm().checkInputCount);
- BACKTRACK();
- }
-
- ASSERT_NOT_REACHED();
- return false;
- }
-
- bool matchNonZeroDisjunction(ByteDisjunction* disjunction, DisjunctionContext* context, bool btrack = false)
- {
- if (matchDisjunction(disjunction, context, btrack)) {
- while (context->matchBegin == context->matchEnd) {
- if (!matchDisjunction(disjunction, context, true))
- return false;
- }
- return true;
- }
-
- return false;
- }
-
- int interpret()
- {
- for (unsigned i = 0; i < ((pattern->m_body->m_numSubpatterns + 1) << 1); ++i)
- output[i] = -1;
-
- DisjunctionContext* context = allocDisjunctionContext(pattern->m_body.get());
-
- if (matchDisjunction(pattern->m_body.get(), context)) {
- output[0] = context->matchBegin;
- output[1] = context->matchEnd;
- }
-
- freeDisjunctionContext(context);
-
- return output[0];
- }
-
- Interpreter(BytecodePattern* pattern, int* output, const UChar* inputChar, unsigned start, unsigned length)
- : pattern(pattern)
- , output(output)
- , input(inputChar, start, length)
- {
- }
-
-private:
- BytecodePattern *pattern;
- int* output;
- InputStream input;
-};
-
-
-
-class ByteCompiler {
- struct ParenthesesStackEntry {
- unsigned beginTerm;
- unsigned savedAlternativeIndex;
- ParenthesesStackEntry(unsigned beginTerm, unsigned savedAlternativeIndex/*, unsigned subpatternId, bool capture = false*/)
- : beginTerm(beginTerm)
- , savedAlternativeIndex(savedAlternativeIndex)
- {
- }
- };
-
-public:
- ByteCompiler(RegexPattern& pattern)
- : m_pattern(pattern)
- {
- m_bodyDisjunction = 0;
- m_currentAlternativeIndex = 0;
- }
-
- BytecodePattern* compile()
- {
- regexBegin(m_pattern.m_numSubpatterns, m_pattern.m_body->m_callFrameSize);
- emitDisjunction(m_pattern.m_body);
- regexEnd();
-
- return new BytecodePattern(m_bodyDisjunction, m_allParenthesesInfo, m_pattern);
- }
-
- void checkInput(unsigned count)
- {
- m_bodyDisjunction->terms.append(ByteTerm::CheckInput(count));
- }
-
- void assertionBOL(int inputPosition)
- {
- m_bodyDisjunction->terms.append(ByteTerm::BOL(inputPosition));
- }
-
- void assertionEOL(int inputPosition)
- {
- m_bodyDisjunction->terms.append(ByteTerm::EOL(inputPosition));
- }
-
- void assertionWordBoundary(bool invert, int inputPosition)
- {
- m_bodyDisjunction->terms.append(ByteTerm::WordBoundary(invert, inputPosition));
- }
-
- void atomPatternCharacter(UChar ch, int inputPosition, unsigned frameLocation, unsigned quantityCount, QuantifierType quantityType)
- {
- if (m_pattern.m_ignoreCase) {
- UChar lo = Unicode::toLower(ch);
- UChar hi = Unicode::toUpper(ch);
-
- if (lo != hi) {
- m_bodyDisjunction->terms.append(ByteTerm(lo, hi, inputPosition, frameLocation, quantityCount, quantityType));
- return;
- }
- }
-
- m_bodyDisjunction->terms.append(ByteTerm(ch, inputPosition, frameLocation, quantityCount, quantityType));
- }
-
- void atomCharacterClass(CharacterClass* characterClass, bool invert, int inputPosition, unsigned frameLocation, unsigned quantityCount, QuantifierType quantityType)
- {
- m_bodyDisjunction->terms.append(ByteTerm(characterClass, invert, inputPosition));
-
- m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].atom.quantityCount = quantityCount;
- m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].atom.quantityType = quantityType;
- m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = frameLocation;
- }
-
- void atomBackReference(unsigned subpatternId, int inputPosition, unsigned frameLocation, unsigned quantityCount, QuantifierType quantityType)
- {
- ASSERT(subpatternId);
-
- m_bodyDisjunction->terms.append(ByteTerm::BackReference(subpatternId, inputPosition));
-
- m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].atom.quantityCount = quantityCount;
- m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].atom.quantityType = quantityType;
- m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = frameLocation;
- }
-
- void atomParenthesesSubpatternBegin(unsigned subpatternId, bool capture, int inputPosition, unsigned frameLocation, unsigned alternativeFrameLocation)
- {
- int beginTerm = m_bodyDisjunction->terms.size();
-
- m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParenthesesSubpatternOnceBegin, subpatternId, capture, inputPosition));
- m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = frameLocation;
- m_bodyDisjunction->terms.append(ByteTerm::AlternativeBegin());
- m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = alternativeFrameLocation;
-
- m_parenthesesStack.append(ParenthesesStackEntry(beginTerm, m_currentAlternativeIndex));
- m_currentAlternativeIndex = beginTerm + 1;
- }
-
- void atomParentheticalAssertionBegin(unsigned subpatternId, bool invert, unsigned frameLocation, unsigned alternativeFrameLocation)
- {
- int beginTerm = m_bodyDisjunction->terms.size();
-
- m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParentheticalAssertionBegin, subpatternId, invert, 0));
- m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = frameLocation;
- m_bodyDisjunction->terms.append(ByteTerm::AlternativeBegin());
- m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = alternativeFrameLocation;
-
- m_parenthesesStack.append(ParenthesesStackEntry(beginTerm, m_currentAlternativeIndex));
- m_currentAlternativeIndex = beginTerm + 1;
- }
-
- unsigned popParenthesesStack()
- {
- ASSERT(m_parenthesesStack.size());
- int stackEnd = m_parenthesesStack.size() - 1;
- unsigned beginTerm = m_parenthesesStack[stackEnd].beginTerm;
- m_currentAlternativeIndex = m_parenthesesStack[stackEnd].savedAlternativeIndex;
- m_parenthesesStack.shrink(stackEnd);
-
- ASSERT(beginTerm < m_bodyDisjunction->terms.size());
- ASSERT(m_currentAlternativeIndex < m_bodyDisjunction->terms.size());
-
- return beginTerm;
- }
-
-#ifndef NDEBUG
- void dumpDisjunction(ByteDisjunction* disjunction)
- {
- printf("ByteDisjunction(%p):\n\t", disjunction);
- for (unsigned i = 0; i < disjunction->terms.size(); ++i)
- printf("{ %d } ", disjunction->terms[i].type);
- printf("\n");
- }
-#endif
-
- void closeAlternative(int beginTerm)
- {
- int origBeginTerm = beginTerm;
- ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeAlternativeBegin);
- int endIndex = m_bodyDisjunction->terms.size();
-
- unsigned frameLocation = m_bodyDisjunction->terms[beginTerm].frameLocation;
-
- if (!m_bodyDisjunction->terms[beginTerm].alternative.next)
- m_bodyDisjunction->terms.remove(beginTerm);
- else {
- while (m_bodyDisjunction->terms[beginTerm].alternative.next) {
- beginTerm += m_bodyDisjunction->terms[beginTerm].alternative.next;
- ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeAlternativeDisjunction);
- m_bodyDisjunction->terms[beginTerm].alternative.end = endIndex - beginTerm;
- m_bodyDisjunction->terms[beginTerm].frameLocation = frameLocation;
- }
-
- m_bodyDisjunction->terms[beginTerm].alternative.next = origBeginTerm - beginTerm;
-
- m_bodyDisjunction->terms.append(ByteTerm::AlternativeEnd());
- m_bodyDisjunction->terms[endIndex].frameLocation = frameLocation;
- }
- }
-
- void closeBodyAlternative()
- {
- int beginTerm = 0;
- int origBeginTerm = 0;
- ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeBodyAlternativeBegin);
- int endIndex = m_bodyDisjunction->terms.size();
-
- unsigned frameLocation = m_bodyDisjunction->terms[beginTerm].frameLocation;
-
- while (m_bodyDisjunction->terms[beginTerm].alternative.next) {
- beginTerm += m_bodyDisjunction->terms[beginTerm].alternative.next;
- ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeBodyAlternativeDisjunction);
- m_bodyDisjunction->terms[beginTerm].alternative.end = endIndex - beginTerm;
- m_bodyDisjunction->terms[beginTerm].frameLocation = frameLocation;
- }
-
- m_bodyDisjunction->terms[beginTerm].alternative.next = origBeginTerm - beginTerm;
-
- m_bodyDisjunction->terms.append(ByteTerm::BodyAlternativeEnd());
- m_bodyDisjunction->terms[endIndex].frameLocation = frameLocation;
- }
-
- void atomParenthesesEnd(bool doInline, unsigned lastSubpatternId, int inputPosition, unsigned frameLocation, unsigned quantityCount, QuantifierType quantityType, unsigned callFrameSize = 0)
- {
- unsigned beginTerm = popParenthesesStack();
- closeAlternative(beginTerm + 1);
- unsigned endTerm = m_bodyDisjunction->terms.size();
-
- bool isAssertion = m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeParentheticalAssertionBegin;
- bool invertOrCapture = m_bodyDisjunction->terms[beginTerm].invertOrCapture;
- unsigned subpatternId = m_bodyDisjunction->terms[beginTerm].atom.subpatternId;
-
- m_bodyDisjunction->terms.append(ByteTerm(isAssertion ? ByteTerm::TypeParentheticalAssertionEnd : ByteTerm::TypeParenthesesSubpatternOnceEnd, subpatternId, invertOrCapture, inputPosition));
- m_bodyDisjunction->terms[beginTerm].atom.parenthesesWidth = endTerm - beginTerm;
- m_bodyDisjunction->terms[endTerm].atom.parenthesesWidth = endTerm - beginTerm;
- m_bodyDisjunction->terms[endTerm].frameLocation = frameLocation;
-
- if (doInline) {
- m_bodyDisjunction->terms[beginTerm].atom.quantityCount = quantityCount;
- m_bodyDisjunction->terms[beginTerm].atom.quantityType = quantityType;
- m_bodyDisjunction->terms[endTerm].atom.quantityCount = quantityCount;
- m_bodyDisjunction->terms[endTerm].atom.quantityType = quantityType;
- } else {
- ByteTerm& parenthesesBegin = m_bodyDisjunction->terms[beginTerm];
- ASSERT(parenthesesBegin.type == ByteTerm::TypeParenthesesSubpatternOnceBegin);
-
- bool invertOrCapture = parenthesesBegin.invertOrCapture;
- unsigned subpatternId = parenthesesBegin.atom.subpatternId;
-
- unsigned numSubpatterns = lastSubpatternId - subpatternId + 1;
- ByteDisjunction* parenthesesDisjunction = new ByteDisjunction(numSubpatterns, callFrameSize);
-
- parenthesesDisjunction->terms.append(ByteTerm::SubpatternBegin());
- for (unsigned termInParentheses = beginTerm + 1; termInParentheses < endTerm; ++termInParentheses)
- parenthesesDisjunction->terms.append(m_bodyDisjunction->terms[termInParentheses]);
- parenthesesDisjunction->terms.append(ByteTerm::SubpatternEnd());
-
- m_bodyDisjunction->terms.shrink(beginTerm);
-
- m_allParenthesesInfo.append(parenthesesDisjunction);
- m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParenthesesSubpattern, subpatternId, parenthesesDisjunction, invertOrCapture, inputPosition));
-
- m_bodyDisjunction->terms[beginTerm].atom.quantityCount = quantityCount;
- m_bodyDisjunction->terms[beginTerm].atom.quantityType = quantityType;
- m_bodyDisjunction->terms[beginTerm].frameLocation = frameLocation;
- }
- }
-
- void regexBegin(unsigned numSubpatterns, unsigned callFrameSize)
- {
- m_bodyDisjunction = new ByteDisjunction(numSubpatterns, callFrameSize);
- m_bodyDisjunction->terms.append(ByteTerm::BodyAlternativeBegin());
- m_bodyDisjunction->terms[0].frameLocation = 0;
- m_currentAlternativeIndex = 0;
- }
-
- void regexEnd()
- {
- closeBodyAlternative();
- }
-
- void alternativeBodyDisjunction()
- {
- int newAlternativeIndex = m_bodyDisjunction->terms.size();
- m_bodyDisjunction->terms[m_currentAlternativeIndex].alternative.next = newAlternativeIndex - m_currentAlternativeIndex;
- m_bodyDisjunction->terms.append(ByteTerm::BodyAlternativeDisjunction());
-
- m_currentAlternativeIndex = newAlternativeIndex;
- }
-
- void alternativeDisjunction()
- {
- int newAlternativeIndex = m_bodyDisjunction->terms.size();
- m_bodyDisjunction->terms[m_currentAlternativeIndex].alternative.next = newAlternativeIndex - m_currentAlternativeIndex;
- m_bodyDisjunction->terms.append(ByteTerm::AlternativeDisjunction());
-
- m_currentAlternativeIndex = newAlternativeIndex;
- }
-
- void emitDisjunction(PatternDisjunction* disjunction, unsigned inputCountAlreadyChecked = 0, unsigned parenthesesInputCountAlreadyChecked = 0)
- {
- for (unsigned alt = 0; alt < disjunction->m_alternatives.size(); ++alt) {
- unsigned currentCountAlreadyChecked = inputCountAlreadyChecked;
-
- if (alt) {
- if (disjunction == m_pattern.m_body)
- alternativeBodyDisjunction();
- else
- alternativeDisjunction();
- }
-
- PatternAlternative* alternative = disjunction->m_alternatives[alt];
- unsigned minimumSize = alternative->m_minimumSize;
-
- ASSERT(minimumSize >= parenthesesInputCountAlreadyChecked);
- unsigned countToCheck = minimumSize - parenthesesInputCountAlreadyChecked;
- if (countToCheck)
- checkInput(countToCheck);
- currentCountAlreadyChecked += countToCheck;
-
- for (unsigned i = 0; i < alternative->m_terms.size(); ++i) {
- PatternTerm& term = alternative->m_terms[i];
-
- switch (term.type) {
- case PatternTerm::TypeAssertionBOL:
- assertionBOL(term.inputPosition - currentCountAlreadyChecked);
- break;
-
- case PatternTerm::TypeAssertionEOL:
- assertionEOL(term.inputPosition - currentCountAlreadyChecked);
- break;
-
- case PatternTerm::TypeAssertionWordBoundary:
- assertionWordBoundary(term.invertOrCapture, term.inputPosition - currentCountAlreadyChecked);
- break;
-
- case PatternTerm::TypePatternCharacter:
- atomPatternCharacter(term.patternCharacter, term.inputPosition - currentCountAlreadyChecked, term.frameLocation, term.quantityCount, term.quantityType);
- break;
-
- case PatternTerm::TypeCharacterClass:
- atomCharacterClass(term.characterClass, term.invertOrCapture, term.inputPosition - currentCountAlreadyChecked, term.frameLocation, term.quantityCount, term.quantityType);
- break;
-
- case PatternTerm::TypeBackReference:
- atomBackReference(term.subpatternId, term.inputPosition - currentCountAlreadyChecked, term.frameLocation, term.quantityCount, term.quantityType);
- break;
-
- case PatternTerm::TypeForwardReference:
- break;
-
- case PatternTerm::TypeParenthesesSubpattern: {
- unsigned disjunctionAlreadyCheckedCount = 0;
- if ((term.quantityCount == 1) && !term.parentheses.isCopy) {
- if (term.quantityType == QuantifierFixedCount) {
- disjunctionAlreadyCheckedCount = term.parentheses.disjunction->m_minimumSize;
- unsigned delegateEndInputOffset = term.inputPosition - currentCountAlreadyChecked;
- atomParenthesesSubpatternBegin(term.parentheses.subpatternId, term.invertOrCapture, delegateEndInputOffset - disjunctionAlreadyCheckedCount, term.frameLocation, term.frameLocation);
- emitDisjunction(term.parentheses.disjunction, currentCountAlreadyChecked, term.parentheses.disjunction->m_minimumSize);
- atomParenthesesEnd(true, term.parentheses.lastSubpatternId, delegateEndInputOffset, term.frameLocation, term.quantityCount, term.quantityType, term.parentheses.disjunction->m_callFrameSize);
- } else {
- unsigned delegateEndInputOffset = term.inputPosition - currentCountAlreadyChecked;
- atomParenthesesSubpatternBegin(term.parentheses.subpatternId, term.invertOrCapture, delegateEndInputOffset - disjunctionAlreadyCheckedCount, term.frameLocation, term.frameLocation + RegexStackSpaceForBackTrackInfoParenthesesOnce);
- emitDisjunction(term.parentheses.disjunction, currentCountAlreadyChecked, 0);
- atomParenthesesEnd(true, term.parentheses.lastSubpatternId, delegateEndInputOffset, term.frameLocation, term.quantityCount, term.quantityType, term.parentheses.disjunction->m_callFrameSize);
- }
- } else {
- unsigned delegateEndInputOffset = term.inputPosition - currentCountAlreadyChecked;
- atomParenthesesSubpatternBegin(term.parentheses.subpatternId, term.invertOrCapture, delegateEndInputOffset - disjunctionAlreadyCheckedCount, term.frameLocation, 0);
- emitDisjunction(term.parentheses.disjunction, currentCountAlreadyChecked, 0);
- atomParenthesesEnd(false, term.parentheses.lastSubpatternId, delegateEndInputOffset, term.frameLocation, term.quantityCount, term.quantityType, term.parentheses.disjunction->m_callFrameSize);
- }
- break;
- }
-
- case PatternTerm::TypeParentheticalAssertion: {
- unsigned alternativeFrameLocation = term.frameLocation + RegexStackSpaceForBackTrackInfoParentheticalAssertion;
-
- atomParentheticalAssertionBegin(term.parentheses.subpatternId, term.invertOrCapture, term.frameLocation, alternativeFrameLocation);
- emitDisjunction(term.parentheses.disjunction, currentCountAlreadyChecked, 0);
- atomParenthesesEnd(true, term.parentheses.lastSubpatternId, 0, term.frameLocation, term.quantityCount, term.quantityType);
- break;
- }
- }
- }
- }
- }
-
-private:
- RegexPattern& m_pattern;
- ByteDisjunction* m_bodyDisjunction;
- unsigned m_currentAlternativeIndex;
- Vector<ParenthesesStackEntry> m_parenthesesStack;
- Vector<ByteDisjunction*> m_allParenthesesInfo;
-};
-
-
-BytecodePattern* byteCompileRegex(const UString& patternString, unsigned& numSubpatterns, const char*& error, bool ignoreCase, bool multiline)
-{
- RegexPattern pattern(ignoreCase, multiline);
-
- if ((error = compileRegex(patternString, pattern)))
- return 0;
-
- numSubpatterns = pattern.m_numSubpatterns;
-
- return ByteCompiler(pattern).compile();
-}
-
-int interpretRegex(BytecodePattern* regex, const UChar* input, unsigned start, unsigned length, int* output)
-{
- return Interpreter(regex, output, input, start, length).interpret();
-}
-
-
-COMPILE_ASSERT(sizeof(Interpreter::BackTrackInfoPatternCharacter) == (RegexStackSpaceForBackTrackInfoPatternCharacter * sizeof(uintptr_t)), CheckRegexStackSpaceForBackTrackInfoPatternCharacter);
-COMPILE_ASSERT(sizeof(Interpreter::BackTrackInfoCharacterClass) == (RegexStackSpaceForBackTrackInfoCharacterClass * sizeof(uintptr_t)), CheckRegexStackSpaceForBackTrackInfoCharacterClass);
-COMPILE_ASSERT(sizeof(Interpreter::BackTrackInfoBackReference) == (RegexStackSpaceForBackTrackInfoBackReference * sizeof(uintptr_t)), CheckRegexStackSpaceForBackTrackInfoBackReference);
-COMPILE_ASSERT(sizeof(Interpreter::BackTrackInfoAlternative) == (RegexStackSpaceForBackTrackInfoAlternative * sizeof(uintptr_t)), CheckRegexStackSpaceForBackTrackInfoAlternative);
-COMPILE_ASSERT(sizeof(Interpreter::BackTrackInfoParentheticalAssertion) == (RegexStackSpaceForBackTrackInfoParentheticalAssertion * sizeof(uintptr_t)), CheckRegexStackSpaceForBackTrackInfoParentheticalAssertion);
-COMPILE_ASSERT(sizeof(Interpreter::BackTrackInfoParenthesesOnce) == (RegexStackSpaceForBackTrackInfoParenthesesOnce * sizeof(uintptr_t)), CheckRegexStackSpaceForBackTrackInfoParenthesesOnce);
-COMPILE_ASSERT(sizeof(Interpreter::BackTrackInfoParentheses) == (RegexStackSpaceForBackTrackInfoParentheses * sizeof(uintptr_t)), CheckRegexStackSpaceForBackTrackInfoParentheses);
-
-
-} }
-
-#endif