/*
- * Copyright (C) 2009 Apple Inc. All rights reserved.
+ * Copyright (C) 2009, 2013 Apple Inc. All rights reserved.
* Copyright (C) 2010 Peter Varga (pvarga@inf.u-szeged.hu), University of Szeged
*
* Redistribution and use in source and binary forms, with or without
#include "YarrPattern.h"
#include "Yarr.h"
+#include "YarrCanonicalizeUCS2.h"
#include "YarrParser.h"
#include <wtf/Vector.h>
void putChar(UChar ch)
{
+ // Handle ascii cases.
if (ch <= 0x7f) {
if (m_isCaseInsensitive && isASCIIAlpha(ch)) {
addSorted(m_matches, toASCIIUpper(ch));
addSorted(m_matches, toASCIILower(ch));
} else
addSorted(m_matches, ch);
- } else {
- UChar upper, lower;
- if (m_isCaseInsensitive && ((upper = Unicode::toUpper(ch)) != (lower = Unicode::toLower(ch)))) {
- addSorted(m_matchesUnicode, upper);
- addSorted(m_matchesUnicode, lower);
- } else
- addSorted(m_matchesUnicode, ch);
+ return;
}
- }
- // returns true if this character has another case, and 'ch' is the upper case form.
- static inline bool isUnicodeUpper(UChar ch)
- {
- return ch != Unicode::toLower(ch);
+ // Simple case, not a case-insensitive match.
+ if (!m_isCaseInsensitive) {
+ addSorted(m_matchesUnicode, ch);
+ return;
+ }
+
+ // Add multiple matches, if necessary.
+ UCS2CanonicalizationRange* info = rangeInfoFor(ch);
+ if (info->type == CanonicalizeUnique)
+ addSorted(m_matchesUnicode, ch);
+ else
+ putUnicodeIgnoreCase(ch, info);
}
- // returns true if this character has another case, and 'ch' is the lower case form.
- static inline bool isUnicodeLower(UChar ch)
+ void putUnicodeIgnoreCase(UChar ch, UCS2CanonicalizationRange* info)
{
- return ch != Unicode::toUpper(ch);
+ ASSERT(m_isCaseInsensitive);
+ ASSERT(ch > 0x7f);
+ ASSERT(ch >= info->begin && ch <= info->end);
+ ASSERT(info->type != CanonicalizeUnique);
+ if (info->type == CanonicalizeSet) {
+ for (uint16_t* set = characterSetInfo[info->value]; (ch = *set); ++set)
+ addSorted(m_matchesUnicode, ch);
+ } else {
+ addSorted(m_matchesUnicode, ch);
+ addSorted(m_matchesUnicode, getCanonicalPair(info, ch));
+ }
}
void putRange(UChar lo, UChar hi)
addSortedRange(m_ranges, std::max(asciiLo, 'a')+('A'-'a'), std::min(asciiHi, 'z')+('A'-'a'));
}
}
- if (hi >= 0x80) {
- uint32_t unicodeCurr = std::max(lo, (UChar)0x80);
- addSortedRange(m_rangesUnicode, unicodeCurr, hi);
-
- if (m_isCaseInsensitive) {
- while (unicodeCurr <= hi) {
- // If the upper bound of the range (hi) is 0xffff, the increments to
- // unicodeCurr in this loop may take it to 0x10000. This is fine
- // (if so we won't re-enter the loop, since the loop condition above
- // will definitely fail) - but this does mean we cannot use a UChar
- // to represent unicodeCurr, we must use a 32-bit value instead.
- ASSERT(unicodeCurr <= 0xffff);
-
- if (isUnicodeUpper(unicodeCurr)) {
- UChar lowerCaseRangeBegin = Unicode::toLower(unicodeCurr);
- UChar lowerCaseRangeEnd = lowerCaseRangeBegin;
- while ((++unicodeCurr <= hi) && isUnicodeUpper(unicodeCurr) && (Unicode::toLower(unicodeCurr) == (lowerCaseRangeEnd + 1)))
- lowerCaseRangeEnd++;
- addSortedRange(m_rangesUnicode, lowerCaseRangeBegin, lowerCaseRangeEnd);
- } else if (isUnicodeLower(unicodeCurr)) {
- UChar upperCaseRangeBegin = Unicode::toUpper(unicodeCurr);
- UChar upperCaseRangeEnd = upperCaseRangeBegin;
- while ((++unicodeCurr <= hi) && isUnicodeLower(unicodeCurr) && (Unicode::toUpper(unicodeCurr) == (upperCaseRangeEnd + 1)))
- upperCaseRangeEnd++;
- addSortedRange(m_rangesUnicode, upperCaseRangeBegin, upperCaseRangeEnd);
- } else
- ++unicodeCurr;
- }
+ if (hi <= 0x7f)
+ return;
+
+ lo = std::max(lo, (UChar)0x80);
+ addSortedRange(m_rangesUnicode, lo, hi);
+
+ if (!m_isCaseInsensitive)
+ return;
+
+ UCS2CanonicalizationRange* info = rangeInfoFor(lo);
+ while (true) {
+ // Handle the range [lo .. end]
+ UChar end = std::min<UChar>(info->end, hi);
+
+ switch (info->type) {
+ case CanonicalizeUnique:
+ // Nothing to do - no canonical equivalents.
+ break;
+ case CanonicalizeSet: {
+ UChar ch;
+ for (uint16_t* set = characterSetInfo[info->value]; (ch = *set); ++set)
+ addSorted(m_matchesUnicode, ch);
+ break;
}
- }
+ case CanonicalizeRangeLo:
+ addSortedRange(m_rangesUnicode, lo + info->value, end + info->value);
+ break;
+ case CanonicalizeRangeHi:
+ addSortedRange(m_rangesUnicode, lo - info->value, end - info->value);
+ break;
+ case CanonicalizeAlternatingAligned:
+ // Use addSortedRange since there is likely an abutting range to combine with.
+ if (lo & 1)
+ addSortedRange(m_rangesUnicode, lo - 1, lo - 1);
+ if (!(end & 1))
+ addSortedRange(m_rangesUnicode, end + 1, end + 1);
+ break;
+ case CanonicalizeAlternatingUnaligned:
+ // Use addSortedRange since there is likely an abutting range to combine with.
+ if (!(lo & 1))
+ addSortedRange(m_rangesUnicode, lo - 1, lo - 1);
+ if (end & 1)
+ addSortedRange(m_rangesUnicode, end + 1, end + 1);
+ break;
+ }
+
+ if (hi == end)
+ return;
+
+ ++info;
+ lo = info->begin;
+ };
+
}
- CharacterClass* charClass()
+ PassOwnPtr<CharacterClass> charClass()
{
- CharacterClass* characterClass = new CharacterClass(0);
-
- characterClass->m_matches.append(m_matches);
- characterClass->m_ranges.append(m_ranges);
- characterClass->m_matchesUnicode.append(m_matchesUnicode);
- characterClass->m_rangesUnicode.append(m_rangesUnicode);
+ OwnPtr<CharacterClass> characterClass = adoptPtr(new CharacterClass);
- reset();
+ characterClass->m_matches.swap(m_matches);
+ characterClass->m_ranges.swap(m_ranges);
+ characterClass->m_matchesUnicode.swap(m_matchesUnicode);
+ characterClass->m_rangesUnicode.swap(m_rangesUnicode);
- return characterClass;
+ return characterClass.release();
}
private:
, m_characterClassConstructor(pattern.m_ignoreCase)
, m_invertParentheticalAssertion(false)
{
- m_pattern.m_body = new PatternDisjunction();
- m_alternative = m_pattern.m_body->addNewAlternative();
- m_pattern.m_disjunctions.append(m_pattern.m_body);
+ OwnPtr<PatternDisjunction> body = adoptPtr(new PatternDisjunction);
+ m_pattern.m_body = body.get();
+ m_alternative = body->addNewAlternative();
+ m_pattern.m_disjunctions.append(body.release());
}
~YarrPatternConstructor()
m_pattern.reset();
m_characterClassConstructor.reset();
- m_pattern.m_body = new PatternDisjunction();
- m_alternative = m_pattern.m_body->addNewAlternative();
- m_pattern.m_disjunctions.append(m_pattern.m_body);
+ OwnPtr<PatternDisjunction> body = adoptPtr(new PatternDisjunction);
+ m_pattern.m_body = body.get();
+ m_alternative = body->addNewAlternative();
+ m_pattern.m_disjunctions.append(body.release());
}
void assertionBOL()
{
// We handle case-insensitive checking of unicode characters which do have both
// cases by handling them as if they were defined using a CharacterClass.
- if (m_pattern.m_ignoreCase && !isASCII(ch) && (Unicode::toUpper(ch) != Unicode::toLower(ch))) {
- atomCharacterClassBegin();
- atomCharacterClassAtom(ch);
- atomCharacterClassEnd();
- } else
+ if (!m_pattern.m_ignoreCase || isASCII(ch)) {
m_alternative->m_terms.append(PatternTerm(ch));
+ return;
+ }
+
+ UCS2CanonicalizationRange* info = rangeInfoFor(ch);
+ if (info->type == CanonicalizeUnique) {
+ m_alternative->m_terms.append(PatternTerm(ch));
+ return;
+ }
+
+ m_characterClassConstructor.putUnicodeIgnoreCase(ch, info);
+ OwnPtr<CharacterClass> newCharacterClass = m_characterClassConstructor.charClass();
+ m_alternative->m_terms.append(PatternTerm(newCharacterClass.get(), false));
+ m_pattern.m_userCharacterClasses.append(newCharacterClass.release());
}
void atomBuiltInCharacterClass(BuiltInCharacterClassID classID, bool invert)
break;
default:
- ASSERT_NOT_REACHED();
+ RELEASE_ASSERT_NOT_REACHED();
}
}
void atomCharacterClassEnd()
{
- CharacterClass* newCharacterClass = m_characterClassConstructor.charClass();
- m_pattern.m_userCharacterClasses.append(newCharacterClass);
- m_alternative->m_terms.append(PatternTerm(newCharacterClass, m_invertCharacterClass));
+ OwnPtr<CharacterClass> newCharacterClass = m_characterClassConstructor.charClass();
+ m_alternative->m_terms.append(PatternTerm(newCharacterClass.get(), m_invertCharacterClass));
+ m_pattern.m_userCharacterClasses.append(newCharacterClass.release());
}
void atomParenthesesSubpatternBegin(bool capture = true)
if (capture)
m_pattern.m_numSubpatterns++;
- PatternDisjunction* parenthesesDisjunction = new PatternDisjunction(m_alternative);
- m_pattern.m_disjunctions.append(parenthesesDisjunction);
- m_alternative->m_terms.append(PatternTerm(PatternTerm::TypeParenthesesSubpattern, subpatternId, parenthesesDisjunction, capture, false));
+ OwnPtr<PatternDisjunction> parenthesesDisjunction = adoptPtr(new PatternDisjunction(m_alternative));
+ m_alternative->m_terms.append(PatternTerm(PatternTerm::TypeParenthesesSubpattern, subpatternId, parenthesesDisjunction.get(), capture, false));
m_alternative = parenthesesDisjunction->addNewAlternative();
+ m_pattern.m_disjunctions.append(parenthesesDisjunction.release());
}
void atomParentheticalAssertionBegin(bool invert = false)
{
- PatternDisjunction* parenthesesDisjunction = new PatternDisjunction(m_alternative);
- m_pattern.m_disjunctions.append(parenthesesDisjunction);
- m_alternative->m_terms.append(PatternTerm(PatternTerm::TypeParentheticalAssertion, m_pattern.m_numSubpatterns + 1, parenthesesDisjunction, false, invert));
+ OwnPtr<PatternDisjunction> parenthesesDisjunction = adoptPtr(new PatternDisjunction(m_alternative));
+ m_alternative->m_terms.append(PatternTerm(PatternTerm::TypeParentheticalAssertion, m_pattern.m_numSubpatterns + 1, parenthesesDisjunction.get(), false, invert));
m_alternative = parenthesesDisjunction->addNewAlternative();
m_invertParentheticalAssertion = invert;
+ m_pattern.m_disjunctions.append(parenthesesDisjunction.release());
}
void atomParenthesesEnd()
// skip alternatives with m_startsWithBOL set true.
PatternDisjunction* copyDisjunction(PatternDisjunction* disjunction, bool filterStartsWithBOL = false)
{
- PatternDisjunction* newDisjunction = 0;
+ OwnPtr<PatternDisjunction> newDisjunction;
for (unsigned alt = 0; alt < disjunction->m_alternatives.size(); ++alt) {
- PatternAlternative* alternative = disjunction->m_alternatives[alt];
+ PatternAlternative* alternative = disjunction->m_alternatives[alt].get();
if (!filterStartsWithBOL || !alternative->m_startsWithBOL) {
if (!newDisjunction) {
- newDisjunction = new PatternDisjunction();
+ newDisjunction = adoptPtr(new PatternDisjunction());
newDisjunction->m_parent = disjunction->m_parent;
}
PatternAlternative* newAlternative = newDisjunction->addNewAlternative();
+ newAlternative->m_terms.reserveInitialCapacity(alternative->m_terms.size());
for (unsigned i = 0; i < alternative->m_terms.size(); ++i)
newAlternative->m_terms.append(copyTerm(alternative->m_terms[i], filterStartsWithBOL));
}
}
- if (newDisjunction)
- m_pattern.m_disjunctions.append(newDisjunction);
- return newDisjunction;
+ if (!newDisjunction)
+ return 0;
+
+ PatternDisjunction* copiedDisjunction = newDisjunction.get();
+ m_pattern.m_disjunctions.append(newDisjunction.release());
+ return copiedDisjunction;
}
PatternTerm copyTerm(PatternTerm& term, bool filterStartsWithBOL = false)
ASSERT(term.type > PatternTerm::TypeAssertionWordBoundary);
ASSERT((term.quantityCount == 1) && (term.quantityType == QuantifierFixedCount));
- // For any assertion with a zero minimum, not matching is valid and has no effect,
- // remove it. Otherwise, we need to match as least once, but there is no point
- // matching more than once, so remove the quantifier. It is not entirely clear
- // from the spec whether or not this behavior is correct, but I believe this
- // matches Firefox. :-/
if (term.type == PatternTerm::TypeParentheticalAssertion) {
+ // If an assertion is quantified with a minimum count of zero, it can simply be removed.
+ // This arises from the RepeatMatcher behaviour in the spec. Matching an assertion never
+ // results in any input being consumed, however the continuation passed to the assertion
+ // (called in steps, 8c and 9 of the RepeatMatcher definition, ES5.1 15.10.2.5) will
+ // reject all zero length matches (see step 2.1). A match from the continuation of the
+ // expression will still be accepted regardless (via steps 8a and 11) - the upshot of all
+ // this is that matches from the assertion are not required, and won't be accepted anyway,
+ // so no need to ever run it.
if (!min)
m_alternative->removeLastTerm();
+ // We never need to run an assertion more than once. Subsequent interations will be run
+ // with the same start index (since assertions are non-capturing) and the same captures
+ // (per step 4 of RepeatMatcher in ES5.1 15.10.2.5), and as such will always produce the
+ // same result and captures. If the first match succeeds then the subsequent (min - 1)
+ // matches will too. Any additional optional matches will fail (on the same basis as the
+ // minimum zero quantified assertions, above), but this will still result in a match.
return;
}
unsigned setupAlternativeOffsets(PatternAlternative* alternative, unsigned currentCallFrameSize, unsigned initialInputPosition)
{
alternative->m_hasFixedSize = true;
- unsigned currentInputPosition = initialInputPosition;
+ Checked<unsigned> currentInputPosition = initialInputPosition;
for (unsigned i = 0; i < alternative->m_terms.size(); ++i) {
PatternTerm& term = alternative->m_terms[i];
case PatternTerm::TypeAssertionBOL:
case PatternTerm::TypeAssertionEOL:
case PatternTerm::TypeAssertionWordBoundary:
- term.inputPosition = currentInputPosition;
+ term.inputPosition = currentInputPosition.unsafeGet();
break;
case PatternTerm::TypeBackReference:
- term.inputPosition = currentInputPosition;
+ term.inputPosition = currentInputPosition.unsafeGet();
term.frameLocation = currentCallFrameSize;
currentCallFrameSize += YarrStackSpaceForBackTrackInfoBackReference;
alternative->m_hasFixedSize = false;
break;
case PatternTerm::TypePatternCharacter:
- term.inputPosition = currentInputPosition;
+ term.inputPosition = currentInputPosition.unsafeGet();
if (term.quantityType != QuantifierFixedCount) {
term.frameLocation = currentCallFrameSize;
currentCallFrameSize += YarrStackSpaceForBackTrackInfoPatternCharacter;
break;
case PatternTerm::TypeCharacterClass:
- term.inputPosition = currentInputPosition;
+ term.inputPosition = currentInputPosition.unsafeGet();
if (term.quantityType != QuantifierFixedCount) {
term.frameLocation = currentCallFrameSize;
currentCallFrameSize += YarrStackSpaceForBackTrackInfoCharacterClass;
if (term.quantityCount == 1 && !term.parentheses.isCopy) {
if (term.quantityType != QuantifierFixedCount)
currentCallFrameSize += YarrStackSpaceForBackTrackInfoParenthesesOnce;
- currentCallFrameSize = setupDisjunctionOffsets(term.parentheses.disjunction, currentCallFrameSize, currentInputPosition);
+ currentCallFrameSize = setupDisjunctionOffsets(term.parentheses.disjunction, currentCallFrameSize, currentInputPosition.unsafeGet());
// If quantity is fixed, then pre-check its minimum size.
if (term.quantityType == QuantifierFixedCount)
currentInputPosition += term.parentheses.disjunction->m_minimumSize;
- term.inputPosition = currentInputPosition;
+ term.inputPosition = currentInputPosition.unsafeGet();
} else if (term.parentheses.isTerminal) {
currentCallFrameSize += YarrStackSpaceForBackTrackInfoParenthesesTerminal;
- currentCallFrameSize = setupDisjunctionOffsets(term.parentheses.disjunction, currentCallFrameSize, currentInputPosition);
- term.inputPosition = currentInputPosition;
+ currentCallFrameSize = setupDisjunctionOffsets(term.parentheses.disjunction, currentCallFrameSize, currentInputPosition.unsafeGet());
+ term.inputPosition = currentInputPosition.unsafeGet();
} else {
- term.inputPosition = currentInputPosition;
- setupDisjunctionOffsets(term.parentheses.disjunction, 0, currentInputPosition);
+ term.inputPosition = currentInputPosition.unsafeGet();
+ setupDisjunctionOffsets(term.parentheses.disjunction, 0, currentInputPosition.unsafeGet());
currentCallFrameSize += YarrStackSpaceForBackTrackInfoParentheses;
}
// Fixed count of 1 could be accepted, if they have a fixed size *AND* if all alternatives are of the same length.
break;
case PatternTerm::TypeParentheticalAssertion:
- term.inputPosition = currentInputPosition;
+ term.inputPosition = currentInputPosition.unsafeGet();
term.frameLocation = currentCallFrameSize;
- currentCallFrameSize = setupDisjunctionOffsets(term.parentheses.disjunction, currentCallFrameSize + YarrStackSpaceForBackTrackInfoParentheticalAssertion, currentInputPosition);
+ currentCallFrameSize = setupDisjunctionOffsets(term.parentheses.disjunction, currentCallFrameSize + YarrStackSpaceForBackTrackInfoParentheticalAssertion, currentInputPosition.unsafeGet());
break;
case PatternTerm::TypeDotStarEnclosure:
}
}
- alternative->m_minimumSize = currentInputPosition - initialInputPosition;
+ alternative->m_minimumSize = (currentInputPosition - initialInputPosition).unsafeGet();
return currentCallFrameSize;
}
bool hasFixedSize = true;
for (unsigned alt = 0; alt < disjunction->m_alternatives.size(); ++alt) {
- PatternAlternative* alternative = disjunction->m_alternatives[alt];
+ PatternAlternative* alternative = disjunction->m_alternatives[alt].get();
unsigned currentAlternativeCallFrameSize = setupAlternativeOffsets(alternative, initialCallFrameSize, initialInputPosition);
- minimumInputSize = min(minimumInputSize, alternative->m_minimumSize);
- maximumCallFrameSize = max(maximumCallFrameSize, currentAlternativeCallFrameSize);
+ minimumInputSize = std::min(minimumInputSize, alternative->m_minimumSize);
+ maximumCallFrameSize = std::max(maximumCallFrameSize, currentAlternativeCallFrameSize);
hasFixedSize &= alternative->m_hasFixedSize;
}
if (m_pattern.m_numSubpatterns)
return;
- Vector<PatternAlternative*>& alternatives = m_pattern.m_body->m_alternatives;
+ Vector<OwnPtr<PatternAlternative> >& alternatives = m_pattern.m_body->m_alternatives;
for (size_t i = 0; i < alternatives.size(); ++i) {
Vector<PatternTerm>& terms = alternatives[i]->m_terms;
if (terms.size()) {
if (loopDisjunction) {
// Move alternatives from loopDisjunction to disjunction
for (unsigned alt = 0; alt < loopDisjunction->m_alternatives.size(); ++alt)
- disjunction->m_alternatives.append(loopDisjunction->m_alternatives[alt]);
+ disjunction->m_alternatives.append(loopDisjunction->m_alternatives[alt].release());
loopDisjunction->m_alternatives.clear();
}
if (term.type == PatternTerm::TypeParenthesesSubpattern) {
PatternDisjunction* nestedDisjunction = term.parentheses.disjunction;
for (unsigned alt = 0; alt < nestedDisjunction->m_alternatives.size(); ++alt) {
- if (containsCapturingTerms(nestedDisjunction->m_alternatives[alt], 0, nestedDisjunction->m_alternatives[alt]->m_terms.size() - 1))
+ if (containsCapturingTerms(nestedDisjunction->m_alternatives[alt].get(), 0, nestedDisjunction->m_alternatives[alt]->m_terms.size() - 1))
return true;
}
}
// beginning and the end of the match.
void optimizeDotStarWrappedExpressions()
{
- Vector<PatternAlternative*>& alternatives = m_pattern.m_body->m_alternatives;
+ Vector<OwnPtr<PatternAlternative> >& alternatives = m_pattern.m_body->m_alternatives;
if (alternatives.size() != 1)
return;
- PatternAlternative* alternative = alternatives[0];
+ PatternAlternative* alternative = alternatives[0].get();
Vector<PatternTerm>& terms = alternative->m_terms;
if (terms.size() >= 3) {
bool startsWithBOL = false;
bool m_invertParentheticalAssertion;
};
-const char* YarrPattern::compile(const UString& patternString)
+const char* YarrPattern::compile(const String& patternString)
{
YarrPatternConstructor constructor(*this);
return 0;
}
-YarrPattern::YarrPattern(const UString& pattern, bool ignoreCase, bool multiline, const char** error)
+YarrPattern::YarrPattern(const String& pattern, bool ignoreCase, bool multiline, const char** error)
: m_ignoreCase(ignoreCase)
, m_multiline(multiline)
, m_containsBackreferences(false)