#include "YarrInterpreter.h"
#include "Yarr.h"
+#include "YarrCanonicalizeUCS2.h"
#include <wtf/BumpPointerAllocator.h>
-
-#ifndef NDEBUG
-#include <stdio.h>
-#endif
+#include <wtf/DataLog.h>
+#include <wtf/text/CString.h>
+#include <wtf/text/WTFString.h>
using namespace WTF;
namespace JSC { namespace Yarr {
+template<typename CharType>
class Interpreter {
public:
struct ParenthesesDisjunctionContext;
static inline void popParenthesesDisjunctionContext(BackTrackInfoParentheses* backTrack)
{
- ASSERT(backTrack->matchAmount);
- ASSERT(backTrack->lastContext);
+ RELEASE_ASSERT(backTrack->matchAmount);
+ RELEASE_ASSERT(backTrack->lastContext);
backTrack->lastContext = backTrack->lastContext->next;
--backTrack->matchAmount;
}
{
size_t size = sizeof(DisjunctionContext) - sizeof(uintptr_t) + disjunction->m_frameSize * sizeof(uintptr_t);
allocatorPool = allocatorPool->ensureCapacity(size);
- if (!allocatorPool)
- CRASH();
- return new(allocatorPool->alloc(size)) DisjunctionContext();
+ RELEASE_ASSERT(allocatorPool);
+ return new (allocatorPool->alloc(size)) DisjunctionContext();
}
void freeDisjunctionContext(DisjunctionContext* context)
struct ParenthesesDisjunctionContext
{
- ParenthesesDisjunctionContext(int* output, ByteTerm& term)
+ ParenthesesDisjunctionContext(unsigned* output, ByteTerm& term)
: next(0)
{
unsigned firstSubpatternId = term.atom.subpatternId;
for (unsigned i = 0; i < (numNestedSubpatterns << 1); ++i) {
subpatternBackup[i] = output[(firstSubpatternId << 1) + i];
- output[(firstSubpatternId << 1) + i] = -1;
+ output[(firstSubpatternId << 1) + i] = offsetNoMatch;
}
- new(getDisjunctionContext(term)) DisjunctionContext();
+ new (getDisjunctionContext(term)) DisjunctionContext();
}
void* operator new(size_t, void* where)
return where;
}
- void restoreOutput(int* output, unsigned firstSubpatternId, unsigned numNestedSubpatterns)
+ void restoreOutput(unsigned* output, unsigned firstSubpatternId, unsigned numNestedSubpatterns)
{
for (unsigned i = 0; i < (numNestedSubpatterns << 1); ++i)
output[(firstSubpatternId << 1) + i] = subpatternBackup[i];
}
ParenthesesDisjunctionContext* next;
- int subpatternBackup[1];
+ unsigned subpatternBackup[1];
};
- ParenthesesDisjunctionContext* allocParenthesesDisjunctionContext(ByteDisjunction* disjunction, int* output, ByteTerm& term)
+ ParenthesesDisjunctionContext* allocParenthesesDisjunctionContext(ByteDisjunction* disjunction, unsigned* output, ByteTerm& term)
{
- size_t size = sizeof(ParenthesesDisjunctionContext) - sizeof(int) + (term.atom.parenthesesDisjunction->m_numSubpatterns << 1) * sizeof(int) + sizeof(DisjunctionContext) - sizeof(uintptr_t) + disjunction->m_frameSize * sizeof(uintptr_t);
+ size_t size = sizeof(ParenthesesDisjunctionContext) - sizeof(unsigned) + (term.atom.parenthesesDisjunction->m_numSubpatterns << 1) * sizeof(unsigned) + sizeof(DisjunctionContext) - sizeof(uintptr_t) + static_cast<size_t>(disjunction->m_frameSize) * sizeof(uintptr_t);
allocatorPool = allocatorPool->ensureCapacity(size);
- if (!allocatorPool)
- CRASH();
- return new(allocatorPool->alloc(size)) ParenthesesDisjunctionContext(output, term);
+ RELEASE_ASSERT(allocatorPool);
+ return new (allocatorPool->alloc(size)) ParenthesesDisjunctionContext(output, term);
}
void freeParenthesesDisjunctionContext(ParenthesesDisjunctionContext* context)
class InputStream {
public:
- InputStream(const UChar* input, unsigned start, unsigned length)
+ InputStream(const CharType* input, unsigned start, unsigned length)
: input(input)
, pos(start)
, length(length)
return input[pos] | input[pos + 1] << 16;
}
- int readChecked(int position)
+ int readChecked(unsigned negativePositionOffest)
{
- ASSERT(position < 0);
- ASSERT(static_cast<unsigned>(-position) <= pos);
- unsigned p = pos + position;
+ RELEASE_ASSERT(pos >= negativePositionOffest);
+ unsigned p = pos - negativePositionOffest;
ASSERT(p < length);
return input[p];
}
return length;
}
- bool checkInput(int count)
+ bool checkInput(unsigned count)
{
- if ((pos + count) <= length) {
+ if (((pos + count) <= length) && ((pos + count) >= pos)) {
pos += count;
return true;
}
return false;
}
- void uncheckInput(int count)
+ void uncheckInput(unsigned count)
{
+ RELEASE_ASSERT(pos >= count);
pos -= count;
}
- bool atStart(int position)
+ bool atStart(unsigned negativePositionOffest)
{
- return (pos + position) == 0;
+ return pos == negativePositionOffest;
}
- bool atEnd(int position)
+ bool atEnd(unsigned negativePositionOffest)
{
- return (pos + position) == length;
+ RELEASE_ASSERT(pos >= negativePositionOffest);
+ return (pos - negativePositionOffest) == length;
}
- bool isNotAvailableInput(int position)
+ bool isAvailableInput(unsigned offset)
{
- return (pos + position) > length;
+ return (((pos + offset) <= length) && ((pos + offset) >= pos));
}
private:
- const UChar* input;
+ const CharType* input;
unsigned pos;
unsigned length;
};
return false;
}
- bool checkCharacter(int testChar, int inputPosition)
+ bool checkCharacter(int testChar, unsigned negativeInputOffset)
{
- return testChar == input.readChecked(inputPosition);
+ return testChar == input.readChecked(negativeInputOffset);
}
- bool checkCasedCharacter(int loChar, int hiChar, int inputPosition)
+ bool checkCasedCharacter(int loChar, int hiChar, unsigned negativeInputOffset)
{
- int ch = input.readChecked(inputPosition);
+ int ch = input.readChecked(negativeInputOffset);
return (loChar == ch) || (hiChar == ch);
}
- bool checkCharacterClass(CharacterClass* characterClass, bool invert, int inputPosition)
+ bool checkCharacterClass(CharacterClass* characterClass, bool invert, unsigned negativeInputOffset)
{
- bool match = testCharacterClass(characterClass, input.readChecked(inputPosition));
+ bool match = testCharacterClass(characterClass, input.readChecked(negativeInputOffset));
return invert ? !match : match;
}
- bool tryConsumeBackReference(int matchBegin, int matchEnd, int inputOffset)
+ bool tryConsumeBackReference(int matchBegin, int matchEnd, unsigned negativeInputOffset)
{
- int matchSize = matchEnd - matchBegin;
+ unsigned matchSize = (unsigned)(matchEnd - matchBegin);
if (!input.checkInput(matchSize))
return false;
if (pattern->m_ignoreCase) {
- for (int i = 0; i < matchSize; ++i) {
- int ch = input.reread(matchBegin + i);
-
- int lo = Unicode::toLower(ch);
- int hi = Unicode::toUpper(ch);
-
- if ((lo != hi) ? (!checkCasedCharacter(lo, hi, inputOffset - matchSize + i)) : (!checkCharacter(ch, inputOffset - matchSize + i))) {
- input.uncheckInput(matchSize);
- return false;
- }
+ for (unsigned i = 0; i < matchSize; ++i) {
+ int oldCh = input.reread(matchBegin + i);
+ int ch = input.readChecked(negativeInputOffset + matchSize - i);
+
+ if (oldCh == ch)
+ continue;
+
+ // The definition for canonicalize (see ES 5.1, 15.10.2.8) means that
+ // unicode values are never allowed to match against ascii ones.
+ if (isASCII(oldCh) || isASCII(ch)) {
+ if (toASCIIUpper(oldCh) == toASCIIUpper(ch))
+ continue;
+ } else if (areCanonicallyEquivalent(oldCh, ch))
+ continue;
+
+ input.uncheckInput(matchSize);
+ return false;
}
} else {
- for (int i = 0; i < matchSize; ++i) {
- if (!checkCharacter(input.reread(matchBegin + i), inputOffset - matchSize + i)) {
+ for (unsigned i = 0; i < matchSize; ++i) {
+ if (!checkCharacter(input.reread(matchBegin + i), negativeInputOffset + matchSize - i)) {
input.uncheckInput(matchSize);
return false;
}
bool matchAssertionBOL(ByteTerm& term)
{
- return (input.atStart(term.inputPosition)) || (pattern->m_multiline && testCharacterClass(pattern->newlineCharacterClass, input.readChecked(term.inputPosition - 1)));
+ return (input.atStart(term.inputPosition)) || (pattern->m_multiline && testCharacterClass(pattern->newlineCharacterClass, input.readChecked(term.inputPosition + 1)));
}
bool matchAssertionEOL(ByteTerm& term)
bool matchAssertionWordBoundary(ByteTerm& term)
{
- bool prevIsWordchar = !input.atStart(term.inputPosition) && testCharacterClass(pattern->wordcharCharacterClass, input.readChecked(term.inputPosition - 1));
+ bool prevIsWordchar = !input.atStart(term.inputPosition) && testCharacterClass(pattern->wordcharCharacterClass, input.readChecked(term.inputPosition + 1));
bool readIsWordchar;
if (term.inputPosition)
readIsWordchar = !input.atEnd(term.inputPosition) && testCharacterClass(pattern->wordcharCharacterClass, input.readChecked(term.inputPosition));
case QuantifierNonGreedy:
if ((backTrack->matchAmount < term.atom.quantityCount) && input.checkInput(1)) {
++backTrack->matchAmount;
- if (checkCharacter(term.atom.patternCharacter, term.inputPosition - 1))
+ if (checkCharacter(term.atom.patternCharacter, term.inputPosition + 1))
return true;
}
input.uncheckInput(backTrack->matchAmount);
case QuantifierNonGreedy:
if ((backTrack->matchAmount < term.atom.quantityCount) && input.checkInput(1)) {
++backTrack->matchAmount;
- if (checkCasedCharacter(term.atom.casedCharacter.lo, term.atom.casedCharacter.hi, term.inputPosition - 1))
+ if (checkCasedCharacter(term.atom.casedCharacter.lo, term.atom.casedCharacter.hi, term.inputPosition + 1))
return true;
}
input.uncheckInput(backTrack->matchAmount);
switch (term.atom.quantityType) {
case QuantifierFixedCount: {
for (unsigned matchAmount = 0; matchAmount < term.atom.quantityCount; ++matchAmount) {
- if (!checkCharacterClass(term.atom.characterClass, term.invert(), term.inputPosition + matchAmount))
+ if (!checkCharacterClass(term.atom.characterClass, term.invert(), term.inputPosition - matchAmount))
return false;
}
return true;
case QuantifierGreedy: {
unsigned matchAmount = 0;
while ((matchAmount < term.atom.quantityCount) && input.checkInput(1)) {
- if (!checkCharacterClass(term.atom.characterClass, term.invert(), term.inputPosition - 1)) {
+ if (!checkCharacterClass(term.atom.characterClass, term.invert(), term.inputPosition + 1)) {
input.uncheckInput(1);
break;
}
return true;
}
- ASSERT_NOT_REACHED();
+ RELEASE_ASSERT_NOT_REACHED();
return false;
}
case QuantifierNonGreedy:
if ((backTrack->matchAmount < term.atom.quantityCount) && input.checkInput(1)) {
++backTrack->matchAmount;
- if (checkCharacterClass(term.atom.characterClass, term.invert(), term.inputPosition - 1))
+ if (checkCharacterClass(term.atom.characterClass, term.invert(), term.inputPosition + 1))
return true;
}
input.uncheckInput(backTrack->matchAmount);
ASSERT(term.type == ByteTerm::TypeBackReference);
BackTrackInfoBackReference* backTrack = reinterpret_cast<BackTrackInfoBackReference*>(context->frame + term.frameLocation);
- int matchBegin = output[(term.atom.subpatternId << 1)];
- int matchEnd = output[(term.atom.subpatternId << 1) + 1];
+ unsigned matchBegin = output[(term.atom.subpatternId << 1)];
+ unsigned matchEnd = output[(term.atom.subpatternId << 1) + 1];
// If the end position of the referenced match hasn't set yet then the backreference in the same parentheses where it references to that.
// In this case the result of match is empty string like when it references to a parentheses with zero-width match.
// Eg.: /(a\1)/
- if (matchEnd == -1)
+ if (matchEnd == offsetNoMatch)
+ return true;
+
+ if (matchBegin == offsetNoMatch)
return true;
- ASSERT((matchBegin == -1) || (matchBegin <= matchEnd));
+ ASSERT(matchBegin <= matchEnd);
if (matchBegin == matchEnd)
return true;
return true;
}
- ASSERT_NOT_REACHED();
+ RELEASE_ASSERT_NOT_REACHED();
return false;
}
ASSERT(term.type == ByteTerm::TypeBackReference);
BackTrackInfoBackReference* backTrack = reinterpret_cast<BackTrackInfoBackReference*>(context->frame + term.frameLocation);
- int matchBegin = output[(term.atom.subpatternId << 1)];
- int matchEnd = output[(term.atom.subpatternId << 1) + 1];
- ASSERT((matchBegin == -1) || (matchBegin <= matchEnd));
+ unsigned matchBegin = output[(term.atom.subpatternId << 1)];
+ unsigned matchEnd = output[(term.atom.subpatternId << 1) + 1];
+
+ if (matchBegin == offsetNoMatch)
+ return false;
+
+ ASSERT(matchBegin <= matchEnd);
if (matchBegin == matchEnd)
return false;
if (term.capture()) {
unsigned subpatternId = term.atom.subpatternId;
- output[(subpatternId << 1)] = input.getPos() + term.inputPosition;
+ output[(subpatternId << 1)] = input.getPos() - term.inputPosition;
}
return true;
if (term.capture()) {
unsigned subpatternId = term.atom.subpatternId;
- output[(subpatternId << 1)] = -1;
- output[(subpatternId << 1) + 1] = -1;
+ output[(subpatternId << 1)] = offsetNoMatch;
+ output[(subpatternId << 1) + 1] = offsetNoMatch;
}
switch (term.atom.quantityType) {
return true;
case QuantifierNonGreedy:
ASSERT(backTrack->begin != notFound);
+ FALLTHROUGH;
case QuantifierFixedCount:
break;
}
context->term -= term.atom.parenthesesWidth;
return false;
}
+ FALLTHROUGH;
case QuantifierNonGreedy:
if (backTrack->begin == notFound) {
backTrack->begin = input.getPos();
context->term -= term.atom.parenthesesWidth;
return true;
}
+ FALLTHROUGH;
case QuantifierFixedCount:
break;
}
{
// 'Terminal' parentheses are at the end of the regex, and as such a match past end
// should always be returned as a successful match - we should never backtrack to here.
- ASSERT_NOT_REACHED();
+ RELEASE_ASSERT_NOT_REACHED();
return false;
}
resetMatches(term, context);
freeParenthesesDisjunctionContext(context);
- if (result == JSRegExpNoMatch) {
- JSRegExpResult backtrackResult = parenthesesDoBacktrack(term, backTrack);
- if (backtrackResult != JSRegExpMatch)
- return backtrackResult;
- } else
+ if (result != JSRegExpNoMatch)
return result;
+ JSRegExpResult backtrackResult = parenthesesDoBacktrack(term, backTrack);
+ if (backtrackResult != JSRegExpMatch)
+ return backtrackResult;
}
}
return JSRegExpMatch;
}
- ASSERT_NOT_REACHED();
+ RELEASE_ASSERT_NOT_REACHED();
return JSRegExpErrorNoMatch;
}
resetMatches(term, context);
freeParenthesesDisjunctionContext(context);
- if (result == JSRegExpNoMatch) {
- JSRegExpResult backtrackResult = parenthesesDoBacktrack(term, backTrack);
- if (backtrackResult != JSRegExpMatch)
- return backtrackResult;
- } else
+ if (result != JSRegExpNoMatch)
return result;
+ JSRegExpResult backtrackResult = parenthesesDoBacktrack(term, backTrack);
+ if (backtrackResult != JSRegExpMatch)
+ return backtrackResult;
}
}
popParenthesesDisjunctionContext(backTrack);
freeParenthesesDisjunctionContext(context);
- return result;
+ if (result != JSRegExpNoMatch)
+ return result;
}
return JSRegExpNoMatch;
}
}
- ASSERT_NOT_REACHED();
+ RELEASE_ASSERT_NOT_REACHED();
return JSRegExpErrorNoMatch;
}
case ByteTerm::TypePatternCharacterOnce:
case ByteTerm::TypePatternCharacterFixed: {
for (unsigned matchAmount = 0; matchAmount < currentTerm().atom.quantityCount; ++matchAmount) {
- if (!checkCharacter(currentTerm().atom.patternCharacter, currentTerm().inputPosition + matchAmount))
+ if (!checkCharacter(currentTerm().atom.patternCharacter, currentTerm().inputPosition - matchAmount))
BACKTRACK();
}
MATCH_NEXT();
BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + currentTerm().frameLocation);
unsigned matchAmount = 0;
while ((matchAmount < currentTerm().atom.quantityCount) && input.checkInput(1)) {
- if (!checkCharacter(currentTerm().atom.patternCharacter, currentTerm().inputPosition - 1)) {
+ if (!checkCharacter(currentTerm().atom.patternCharacter, currentTerm().inputPosition + 1)) {
input.uncheckInput(1);
break;
}
case ByteTerm::TypePatternCasedCharacterOnce:
case ByteTerm::TypePatternCasedCharacterFixed: {
for (unsigned matchAmount = 0; matchAmount < currentTerm().atom.quantityCount; ++matchAmount) {
- if (!checkCasedCharacter(currentTerm().atom.casedCharacter.lo, currentTerm().atom.casedCharacter.hi, currentTerm().inputPosition + matchAmount))
+ if (!checkCasedCharacter(currentTerm().atom.casedCharacter.lo, currentTerm().atom.casedCharacter.hi, currentTerm().inputPosition - matchAmount))
BACKTRACK();
}
MATCH_NEXT();
BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + currentTerm().frameLocation);
unsigned matchAmount = 0;
while ((matchAmount < currentTerm().atom.quantityCount) && input.checkInput(1)) {
- if (!checkCasedCharacter(currentTerm().atom.casedCharacter.lo, currentTerm().atom.casedCharacter.hi, currentTerm().inputPosition - 1)) {
+ if (!checkCasedCharacter(currentTerm().atom.casedCharacter.lo, currentTerm().atom.casedCharacter.hi, currentTerm().inputPosition + 1)) {
input.uncheckInput(1);
break;
}
}
// We should never fall-through to here.
- ASSERT_NOT_REACHED();
+ RELEASE_ASSERT_NOT_REACHED();
backtrack:
ASSERT(context->term < static_cast<int>(disjunction->terms.size()));
case ByteTerm::TypeSubpatternBegin:
return JSRegExpNoMatch;
case ByteTerm::TypeSubpatternEnd:
- ASSERT_NOT_REACHED();
+ RELEASE_ASSERT_NOT_REACHED();
case ByteTerm::TypeBodyAlternativeBegin:
case ByteTerm::TypeBodyAlternativeDisjunction: {
MATCH_NEXT();
}
case ByteTerm::TypeBodyAlternativeEnd:
- ASSERT_NOT_REACHED();
+ RELEASE_ASSERT_NOT_REACHED();
case ByteTerm::TypeAlternativeBegin:
case ByteTerm::TypeAlternativeDisjunction: {
BACKTRACK();
case ByteTerm::TypeDotStarEnclosure:
- ASSERT_NOT_REACHED();
+ RELEASE_ASSERT_NOT_REACHED();
}
- ASSERT_NOT_REACHED();
+ RELEASE_ASSERT_NOT_REACHED();
return JSRegExpErrorNoMatch;
}
return result;
}
- int interpret()
+ unsigned interpret()
{
- allocatorPool = pattern->m_allocator->startAllocator();
- if (!allocatorPool)
- CRASH();
+ if (!input.isAvailableInput(0))
+ return offsetNoMatch;
- for (unsigned i = 0; i < ((pattern->m_body->m_numSubpatterns + 1) << 1); ++i)
- output[i] = -1;
+ for (unsigned i = 0; i < pattern->m_body->m_numSubpatterns + 1; ++i)
+ output[i << 1] = offsetNoMatch;
+
+ allocatorPool = pattern->m_allocator->startAllocator();
+ RELEASE_ASSERT(allocatorPool);
DisjunctionContext* context = allocDisjunctionContext(pattern->m_body.get());
pattern->m_allocator->stopAllocator();
- // RegExp.cpp currently expects all error to be converted to -1.
- ASSERT((result == JSRegExpMatch) == (output[0] != -1));
+ ASSERT((result == JSRegExpMatch) == (output[0] != offsetNoMatch));
return output[0];
}
- Interpreter(BytecodePattern* pattern, int* output, const UChar* inputChar, unsigned start, unsigned length)
+ Interpreter(BytecodePattern* pattern, unsigned* output, const CharType* input, unsigned length, unsigned start)
: pattern(pattern)
, output(output)
- , input(inputChar, start, length)
+ , input(input, start, length)
, allocatorPool(0)
, remainingMatchCount(matchLimit)
{
private:
BytecodePattern* pattern;
- int* output;
+ unsigned* output;
InputStream input;
BumpPointerPool* allocatorPool;
unsigned remainingMatchCount;
};
-
-
class ByteCompiler {
struct ParenthesesStackEntry {
unsigned beginTerm;
m_currentAlternativeIndex = 0;
}
- PassOwnPtr<BytecodePattern> compile(BumpPointerAllocator* allocator)
+ std::unique_ptr<BytecodePattern> compile(BumpPointerAllocator* allocator)
{
regexBegin(m_pattern.m_numSubpatterns, m_pattern.m_body->m_callFrameSize, m_pattern.m_body->m_alternatives[0]->onceThrough());
emitDisjunction(m_pattern.m_body);
regexEnd();
- return adoptPtr(new BytecodePattern(m_bodyDisjunction.release(), m_allParenthesesInfo, m_pattern, allocator));
+ return std::make_unique<BytecodePattern>(WTF::move(m_bodyDisjunction), m_allParenthesesInfo, m_pattern, allocator);
}
void checkInput(unsigned count)
m_bodyDisjunction->terms.append(ByteTerm::UncheckInput(count));
}
- void assertionBOL(int inputPosition)
+ void assertionBOL(unsigned inputPosition)
{
m_bodyDisjunction->terms.append(ByteTerm::BOL(inputPosition));
}
- void assertionEOL(int inputPosition)
+ void assertionEOL(unsigned inputPosition)
{
m_bodyDisjunction->terms.append(ByteTerm::EOL(inputPosition));
}
- void assertionWordBoundary(bool invert, int inputPosition)
+ void assertionWordBoundary(bool invert, unsigned inputPosition)
{
m_bodyDisjunction->terms.append(ByteTerm::WordBoundary(invert, inputPosition));
}
- void atomPatternCharacter(UChar ch, int inputPosition, unsigned frameLocation, Checked<unsigned> quantityCount, QuantifierType quantityType)
+ void atomPatternCharacter(UChar ch, unsigned inputPosition, unsigned frameLocation, Checked<unsigned> quantityCount, QuantifierType quantityType)
{
if (m_pattern.m_ignoreCase) {
- UChar lo = Unicode::toLower(ch);
- UChar hi = Unicode::toUpper(ch);
+ ASSERT(u_tolower(ch) <= 0xFFFF);
+ ASSERT(u_toupper(ch) <= 0xFFFF);
+
+ UChar lo = u_tolower(ch);
+ UChar hi = u_toupper(ch);
if (lo != hi) {
m_bodyDisjunction->terms.append(ByteTerm(lo, hi, inputPosition, frameLocation, quantityCount, quantityType));
m_bodyDisjunction->terms.append(ByteTerm(ch, inputPosition, frameLocation, quantityCount, quantityType));
}
- void atomCharacterClass(CharacterClass* characterClass, bool invert, int inputPosition, unsigned frameLocation, Checked<unsigned> quantityCount, QuantifierType quantityType)
+ void atomCharacterClass(CharacterClass* characterClass, bool invert, unsigned inputPosition, unsigned frameLocation, Checked<unsigned> quantityCount, QuantifierType quantityType)
{
m_bodyDisjunction->terms.append(ByteTerm(characterClass, invert, inputPosition));
m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = frameLocation;
}
- void atomBackReference(unsigned subpatternId, int inputPosition, unsigned frameLocation, Checked<unsigned> quantityCount, QuantifierType quantityType)
+ void atomBackReference(unsigned subpatternId, unsigned inputPosition, unsigned frameLocation, Checked<unsigned> quantityCount, QuantifierType quantityType)
{
ASSERT(subpatternId);
m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = frameLocation;
}
- void atomParenthesesOnceBegin(unsigned subpatternId, bool capture, int inputPosition, unsigned frameLocation, unsigned alternativeFrameLocation)
+ void atomParenthesesOnceBegin(unsigned subpatternId, bool capture, unsigned inputPosition, unsigned frameLocation, unsigned alternativeFrameLocation)
{
int beginTerm = m_bodyDisjunction->terms.size();
m_currentAlternativeIndex = beginTerm + 1;
}
- void atomParenthesesTerminalBegin(unsigned subpatternId, bool capture, int inputPosition, unsigned frameLocation, unsigned alternativeFrameLocation)
+ void atomParenthesesTerminalBegin(unsigned subpatternId, bool capture, unsigned inputPosition, unsigned frameLocation, unsigned alternativeFrameLocation)
{
int beginTerm = m_bodyDisjunction->terms.size();
m_currentAlternativeIndex = beginTerm + 1;
}
- void atomParenthesesSubpatternBegin(unsigned subpatternId, bool capture, int inputPosition, unsigned frameLocation, unsigned alternativeFrameLocation)
+ void atomParenthesesSubpatternBegin(unsigned subpatternId, bool capture, unsigned inputPosition, unsigned frameLocation, unsigned alternativeFrameLocation)
{
// Errrk! - this is a little crazy, we initially generate as a TypeParenthesesSubpatternOnceBegin,
// then fix this up at the end! - simplifying this should make it much clearer.
m_currentAlternativeIndex = beginTerm + 1;
}
- void atomParentheticalAssertionEnd(int inputPosition, unsigned frameLocation, Checked<unsigned> quantityCount, QuantifierType quantityType)
+ void atomParentheticalAssertionEnd(unsigned inputPosition, unsigned frameLocation, Checked<unsigned> quantityCount, QuantifierType quantityType)
{
unsigned beginTerm = popParenthesesStack();
closeAlternative(beginTerm + 1);
#ifndef NDEBUG
void dumpDisjunction(ByteDisjunction* disjunction)
{
- printf("ByteDisjunction(%p):\n\t", disjunction);
+ dataLogF("ByteDisjunction(%p):\n\t", disjunction);
for (unsigned i = 0; i < disjunction->terms.size(); ++i)
- printf("{ %d } ", disjunction->terms[i].type);
- printf("\n");
+ dataLogF("{ %d } ", disjunction->terms[i].type);
+ dataLogF("\n");
}
#endif
unsigned subpatternId = parenthesesBegin.atom.subpatternId;
unsigned numSubpatterns = lastSubpatternId - subpatternId + 1;
- ByteDisjunction* parenthesesDisjunction = new ByteDisjunction(numSubpatterns, callFrameSize);
+ auto parenthesesDisjunction = std::make_unique<ByteDisjunction>(numSubpatterns, callFrameSize);
+
+ unsigned firstTermInParentheses = beginTerm + 1;
+ parenthesesDisjunction->terms.reserveInitialCapacity(endTerm - firstTermInParentheses + 2);
parenthesesDisjunction->terms.append(ByteTerm::SubpatternBegin());
- for (unsigned termInParentheses = beginTerm + 1; termInParentheses < endTerm; ++termInParentheses)
+ for (unsigned termInParentheses = firstTermInParentheses; termInParentheses < endTerm; ++termInParentheses)
parenthesesDisjunction->terms.append(m_bodyDisjunction->terms[termInParentheses]);
parenthesesDisjunction->terms.append(ByteTerm::SubpatternEnd());
m_bodyDisjunction->terms.shrink(beginTerm);
- m_allParenthesesInfo.append(parenthesesDisjunction);
- m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParenthesesSubpattern, subpatternId, parenthesesDisjunction, capture, inputPosition));
+ m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParenthesesSubpattern, subpatternId, parenthesesDisjunction.get(), capture, inputPosition));
+ m_allParenthesesInfo.append(WTF::move(parenthesesDisjunction));
m_bodyDisjunction->terms[beginTerm].atom.quantityCount = quantityCount.unsafeGet();
m_bodyDisjunction->terms[beginTerm].atom.quantityType = quantityType;
void regexBegin(unsigned numSubpatterns, unsigned callFrameSize, bool onceThrough)
{
- m_bodyDisjunction = adoptPtr(new ByteDisjunction(numSubpatterns, callFrameSize));
+ m_bodyDisjunction = std::make_unique<ByteDisjunction>(numSubpatterns, callFrameSize);
m_bodyDisjunction->terms.append(ByteTerm::BodyAlternativeBegin(onceThrough));
m_bodyDisjunction->terms[0].frameLocation = 0;
m_currentAlternativeIndex = 0;
for (unsigned alt = 0; alt < disjunction->m_alternatives.size(); ++alt) {
unsigned currentCountAlreadyChecked = inputCountAlreadyChecked;
- PatternAlternative* alternative = disjunction->m_alternatives[alt];
+ PatternAlternative* alternative = disjunction->m_alternatives[alt].get();
if (alt) {
if (disjunction == m_pattern.m_body)
switch (term.type) {
case PatternTerm::TypeAssertionBOL:
- assertionBOL(term.inputPosition - currentCountAlreadyChecked);
+ assertionBOL(currentCountAlreadyChecked - term.inputPosition);
break;
case PatternTerm::TypeAssertionEOL:
- assertionEOL(term.inputPosition - currentCountAlreadyChecked);
+ assertionEOL(currentCountAlreadyChecked - term.inputPosition);
break;
case PatternTerm::TypeAssertionWordBoundary:
- assertionWordBoundary(term.invert(), term.inputPosition - currentCountAlreadyChecked);
+ assertionWordBoundary(term.invert(), currentCountAlreadyChecked - term.inputPosition);
break;
case PatternTerm::TypePatternCharacter:
- atomPatternCharacter(term.patternCharacter, term.inputPosition - currentCountAlreadyChecked, term.frameLocation, term.quantityCount, term.quantityType);
+ atomPatternCharacter(term.patternCharacter, currentCountAlreadyChecked - term.inputPosition, term.frameLocation, term.quantityCount, term.quantityType);
break;
case PatternTerm::TypeCharacterClass:
- atomCharacterClass(term.characterClass, term.invert(), term.inputPosition - currentCountAlreadyChecked, term.frameLocation, term.quantityCount, term.quantityType);
+ atomCharacterClass(term.characterClass, term.invert(), currentCountAlreadyChecked- term.inputPosition, term.frameLocation, term.quantityCount, term.quantityType);
break;
case PatternTerm::TypeBackReference:
- atomBackReference(term.backReferenceSubpatternId, term.inputPosition - currentCountAlreadyChecked, term.frameLocation, term.quantityCount, term.quantityType);
+ atomBackReference(term.backReferenceSubpatternId, currentCountAlreadyChecked - term.inputPosition, term.frameLocation, term.quantityCount, term.quantityType);
break;
case PatternTerm::TypeForwardReference:
else
alternativeFrameLocation += YarrStackSpaceForBackTrackInfoParenthesesOnce;
unsigned delegateEndInputOffset = term.inputPosition - currentCountAlreadyChecked;
- atomParenthesesOnceBegin(term.parentheses.subpatternId, term.capture(), delegateEndInputOffset - disjunctionAlreadyCheckedCount, term.frameLocation, alternativeFrameLocation);
+ atomParenthesesOnceBegin(term.parentheses.subpatternId, term.capture(), disjunctionAlreadyCheckedCount - delegateEndInputOffset, term.frameLocation, alternativeFrameLocation);
emitDisjunction(term.parentheses.disjunction, currentCountAlreadyChecked, disjunctionAlreadyCheckedCount);
atomParenthesesOnceEnd(delegateEndInputOffset, term.frameLocation, term.quantityCount, term.quantityType);
} else if (term.parentheses.isTerminal) {
unsigned delegateEndInputOffset = term.inputPosition - currentCountAlreadyChecked;
- atomParenthesesTerminalBegin(term.parentheses.subpatternId, term.capture(), delegateEndInputOffset - disjunctionAlreadyCheckedCount, term.frameLocation, term.frameLocation + YarrStackSpaceForBackTrackInfoParenthesesOnce);
+ atomParenthesesTerminalBegin(term.parentheses.subpatternId, term.capture(), disjunctionAlreadyCheckedCount - delegateEndInputOffset, term.frameLocation, term.frameLocation + YarrStackSpaceForBackTrackInfoParenthesesOnce);
emitDisjunction(term.parentheses.disjunction, currentCountAlreadyChecked, disjunctionAlreadyCheckedCount);
atomParenthesesTerminalEnd(delegateEndInputOffset, term.frameLocation, term.quantityCount, term.quantityType);
} else {
unsigned delegateEndInputOffset = term.inputPosition - currentCountAlreadyChecked;
- atomParenthesesSubpatternBegin(term.parentheses.subpatternId, term.capture(), delegateEndInputOffset - disjunctionAlreadyCheckedCount, term.frameLocation, 0);
+ atomParenthesesSubpatternBegin(term.parentheses.subpatternId, term.capture(), disjunctionAlreadyCheckedCount - delegateEndInputOffset, term.frameLocation, 0);
emitDisjunction(term.parentheses.disjunction, currentCountAlreadyChecked, 0);
atomParenthesesSubpatternEnd(term.parentheses.lastSubpatternId, delegateEndInputOffset, term.frameLocation, term.quantityCount, term.quantityType, term.parentheses.disjunction->m_callFrameSize);
}
private:
YarrPattern& m_pattern;
- OwnPtr<ByteDisjunction> m_bodyDisjunction;
+ std::unique_ptr<ByteDisjunction> m_bodyDisjunction;
unsigned m_currentAlternativeIndex;
Vector<ParenthesesStackEntry> m_parenthesesStack;
- Vector<ByteDisjunction*> m_allParenthesesInfo;
+ Vector<std::unique_ptr<ByteDisjunction>> m_allParenthesesInfo;
};
-PassOwnPtr<BytecodePattern> byteCompile(YarrPattern& pattern, BumpPointerAllocator* allocator)
+std::unique_ptr<BytecodePattern> byteCompile(YarrPattern& pattern, BumpPointerAllocator* allocator)
{
return ByteCompiler(pattern).compile(allocator);
}
-int interpret(BytecodePattern* bytecode, const UChar* input, unsigned start, unsigned length, int* output)
+unsigned interpret(BytecodePattern* bytecode, const String& input, unsigned start, unsigned* output)
+{
+ if (input.is8Bit())
+ return Interpreter<LChar>(bytecode, output, input.characters8(), input.length(), start).interpret();
+ return Interpreter<UChar>(bytecode, output, input.characters16(), input.length(), start).interpret();
+}
+
+unsigned interpret(BytecodePattern* bytecode, const LChar* input, unsigned length, unsigned start, unsigned* output)
+{
+ return Interpreter<LChar>(bytecode, output, input, length, start).interpret();
+}
+
+unsigned interpret(BytecodePattern* bytecode, const UChar* input, unsigned length, unsigned start, unsigned* output)
{
- return Interpreter(bytecode, output, input, start, length).interpret();
+ return Interpreter<UChar>(bytecode, output, input, length, start).interpret();
}
-COMPILE_ASSERT(sizeof(Interpreter::BackTrackInfoPatternCharacter) == (YarrStackSpaceForBackTrackInfoPatternCharacter * sizeof(uintptr_t)), CheckYarrStackSpaceForBackTrackInfoPatternCharacter);
-COMPILE_ASSERT(sizeof(Interpreter::BackTrackInfoCharacterClass) == (YarrStackSpaceForBackTrackInfoCharacterClass * sizeof(uintptr_t)), CheckYarrStackSpaceForBackTrackInfoCharacterClass);
-COMPILE_ASSERT(sizeof(Interpreter::BackTrackInfoBackReference) == (YarrStackSpaceForBackTrackInfoBackReference * sizeof(uintptr_t)), CheckYarrStackSpaceForBackTrackInfoBackReference);
-COMPILE_ASSERT(sizeof(Interpreter::BackTrackInfoAlternative) == (YarrStackSpaceForBackTrackInfoAlternative * sizeof(uintptr_t)), CheckYarrStackSpaceForBackTrackInfoAlternative);
-COMPILE_ASSERT(sizeof(Interpreter::BackTrackInfoParentheticalAssertion) == (YarrStackSpaceForBackTrackInfoParentheticalAssertion * sizeof(uintptr_t)), CheckYarrStackSpaceForBackTrackInfoParentheticalAssertion);
-COMPILE_ASSERT(sizeof(Interpreter::BackTrackInfoParenthesesOnce) == (YarrStackSpaceForBackTrackInfoParenthesesOnce * sizeof(uintptr_t)), CheckYarrStackSpaceForBackTrackInfoParenthesesOnce);
-COMPILE_ASSERT(sizeof(Interpreter::BackTrackInfoParentheses) == (YarrStackSpaceForBackTrackInfoParentheses * sizeof(uintptr_t)), CheckYarrStackSpaceForBackTrackInfoParentheses);
+// These should be the same for both UChar & LChar.
+COMPILE_ASSERT(sizeof(Interpreter<UChar>::BackTrackInfoPatternCharacter) == (YarrStackSpaceForBackTrackInfoPatternCharacter * sizeof(uintptr_t)), CheckYarrStackSpaceForBackTrackInfoPatternCharacter);
+COMPILE_ASSERT(sizeof(Interpreter<UChar>::BackTrackInfoCharacterClass) == (YarrStackSpaceForBackTrackInfoCharacterClass * sizeof(uintptr_t)), CheckYarrStackSpaceForBackTrackInfoCharacterClass);
+COMPILE_ASSERT(sizeof(Interpreter<UChar>::BackTrackInfoBackReference) == (YarrStackSpaceForBackTrackInfoBackReference * sizeof(uintptr_t)), CheckYarrStackSpaceForBackTrackInfoBackReference);
+COMPILE_ASSERT(sizeof(Interpreter<UChar>::BackTrackInfoAlternative) == (YarrStackSpaceForBackTrackInfoAlternative * sizeof(uintptr_t)), CheckYarrStackSpaceForBackTrackInfoAlternative);
+COMPILE_ASSERT(sizeof(Interpreter<UChar>::BackTrackInfoParentheticalAssertion) == (YarrStackSpaceForBackTrackInfoParentheticalAssertion * sizeof(uintptr_t)), CheckYarrStackSpaceForBackTrackInfoParentheticalAssertion);
+COMPILE_ASSERT(sizeof(Interpreter<UChar>::BackTrackInfoParenthesesOnce) == (YarrStackSpaceForBackTrackInfoParenthesesOnce * sizeof(uintptr_t)), CheckYarrStackSpaceForBackTrackInfoParenthesesOnce);
+COMPILE_ASSERT(sizeof(Interpreter<UChar>::BackTrackInfoParentheses) == (YarrStackSpaceForBackTrackInfoParentheses * sizeof(uintptr_t)), CheckYarrStackSpaceForBackTrackInfoParentheses);
} }