X-Git-Url: https://git.saurik.com/apple/javascriptcore.git/blobdiff_plain/1df5f87f1309a8daa30dabdee855f48ae40d14ab..2d39b0e377c0896910ee49ae70082ba665faf986:/yarr/YarrPattern.cpp diff --git a/yarr/YarrPattern.cpp b/yarr/YarrPattern.cpp index bf150c1..56dd5e5 100644 --- a/yarr/YarrPattern.cpp +++ b/yarr/YarrPattern.cpp @@ -1,5 +1,5 @@ /* - * Copyright (C) 2009 Apple Inc. All rights reserved. + * Copyright (C) 2009, 2013 Apple Inc. All rights reserved. * Copyright (C) 2010 Peter Varga (pvarga@inf.u-szeged.hu), University of Szeged * * Redistribution and use in source and binary forms, with or without @@ -28,6 +28,7 @@ #include "YarrPattern.h" #include "Yarr.h" +#include "YarrCanonicalizeUCS2.h" #include "YarrParser.h" #include @@ -66,32 +67,43 @@ public: void putChar(UChar ch) { + // Handle ascii cases. if (ch <= 0x7f) { if (m_isCaseInsensitive && isASCIIAlpha(ch)) { addSorted(m_matches, toASCIIUpper(ch)); addSorted(m_matches, toASCIILower(ch)); } else addSorted(m_matches, ch); - } else { - UChar upper, lower; - if (m_isCaseInsensitive && ((upper = Unicode::toUpper(ch)) != (lower = Unicode::toLower(ch)))) { - addSorted(m_matchesUnicode, upper); - addSorted(m_matchesUnicode, lower); - } else - addSorted(m_matchesUnicode, ch); + return; } - } - // returns true if this character has another case, and 'ch' is the upper case form. - static inline bool isUnicodeUpper(UChar ch) - { - return ch != Unicode::toLower(ch); + // Simple case, not a case-insensitive match. + if (!m_isCaseInsensitive) { + addSorted(m_matchesUnicode, ch); + return; + } + + // Add multiple matches, if necessary. + const UCS2CanonicalizationRange* info = rangeInfoFor(ch); + if (info->type == CanonicalizeUnique) + addSorted(m_matchesUnicode, ch); + else + putUnicodeIgnoreCase(ch, info); } - // returns true if this character has another case, and 'ch' is the lower case form. - static inline bool isUnicodeLower(UChar ch) + void putUnicodeIgnoreCase(UChar ch, const UCS2CanonicalizationRange* info) { - return ch != Unicode::toUpper(ch); + ASSERT(m_isCaseInsensitive); + ASSERT(ch > 0x7f); + ASSERT(ch >= info->begin && ch <= info->end); + ASSERT(info->type != CanonicalizeUnique); + if (info->type == CanonicalizeSet) { + for (const uint16_t* set = characterSetInfo[info->value]; (ch = *set); ++set) + addSorted(m_matchesUnicode, ch); + } else { + addSorted(m_matchesUnicode, ch); + addSorted(m_matchesUnicode, getCanonicalPair(info, ch)); + } } void putRange(UChar lo, UChar hi) @@ -108,50 +120,71 @@ public: addSortedRange(m_ranges, std::max(asciiLo, 'a')+('A'-'a'), std::min(asciiHi, 'z')+('A'-'a')); } } - if (hi >= 0x80) { - uint32_t unicodeCurr = std::max(lo, (UChar)0x80); - addSortedRange(m_rangesUnicode, unicodeCurr, hi); - - if (m_isCaseInsensitive) { - while (unicodeCurr <= hi) { - // If the upper bound of the range (hi) is 0xffff, the increments to - // unicodeCurr in this loop may take it to 0x10000. This is fine - // (if so we won't re-enter the loop, since the loop condition above - // will definitely fail) - but this does mean we cannot use a UChar - // to represent unicodeCurr, we must use a 32-bit value instead. - ASSERT(unicodeCurr <= 0xffff); - - if (isUnicodeUpper(unicodeCurr)) { - UChar lowerCaseRangeBegin = Unicode::toLower(unicodeCurr); - UChar lowerCaseRangeEnd = lowerCaseRangeBegin; - while ((++unicodeCurr <= hi) && isUnicodeUpper(unicodeCurr) && (Unicode::toLower(unicodeCurr) == (lowerCaseRangeEnd + 1))) - lowerCaseRangeEnd++; - addSortedRange(m_rangesUnicode, lowerCaseRangeBegin, lowerCaseRangeEnd); - } else if (isUnicodeLower(unicodeCurr)) { - UChar upperCaseRangeBegin = Unicode::toUpper(unicodeCurr); - UChar upperCaseRangeEnd = upperCaseRangeBegin; - while ((++unicodeCurr <= hi) && isUnicodeLower(unicodeCurr) && (Unicode::toUpper(unicodeCurr) == (upperCaseRangeEnd + 1))) - upperCaseRangeEnd++; - addSortedRange(m_rangesUnicode, upperCaseRangeBegin, upperCaseRangeEnd); - } else - ++unicodeCurr; - } + if (hi <= 0x7f) + return; + + lo = std::max(lo, (UChar)0x80); + addSortedRange(m_rangesUnicode, lo, hi); + + if (!m_isCaseInsensitive) + return; + + const UCS2CanonicalizationRange* info = rangeInfoFor(lo); + while (true) { + // Handle the range [lo .. end] + UChar end = std::min(info->end, hi); + + switch (info->type) { + case CanonicalizeUnique: + // Nothing to do - no canonical equivalents. + break; + case CanonicalizeSet: { + UChar ch; + for (const uint16_t* set = characterSetInfo[info->value]; (ch = *set); ++set) + addSorted(m_matchesUnicode, ch); + break; } - } + case CanonicalizeRangeLo: + addSortedRange(m_rangesUnicode, lo + info->value, end + info->value); + break; + case CanonicalizeRangeHi: + addSortedRange(m_rangesUnicode, lo - info->value, end - info->value); + break; + case CanonicalizeAlternatingAligned: + // Use addSortedRange since there is likely an abutting range to combine with. + if (lo & 1) + addSortedRange(m_rangesUnicode, lo - 1, lo - 1); + if (!(end & 1)) + addSortedRange(m_rangesUnicode, end + 1, end + 1); + break; + case CanonicalizeAlternatingUnaligned: + // Use addSortedRange since there is likely an abutting range to combine with. + if (!(lo & 1)) + addSortedRange(m_rangesUnicode, lo - 1, lo - 1); + if (end & 1) + addSortedRange(m_rangesUnicode, end + 1, end + 1); + break; + } + + if (hi == end) + return; + + ++info; + lo = info->begin; + }; + } - CharacterClass* charClass() + PassOwnPtr charClass() { - CharacterClass* characterClass = new CharacterClass(0); - - characterClass->m_matches.append(m_matches); - characterClass->m_ranges.append(m_ranges); - characterClass->m_matchesUnicode.append(m_matchesUnicode); - characterClass->m_rangesUnicode.append(m_rangesUnicode); + OwnPtr characterClass = adoptPtr(new CharacterClass); - reset(); + characterClass->m_matches.swap(m_matches); + characterClass->m_ranges.swap(m_ranges); + characterClass->m_matchesUnicode.swap(m_matchesUnicode); + characterClass->m_rangesUnicode.swap(m_rangesUnicode); - return characterClass; + return characterClass.release(); } private: @@ -241,9 +274,10 @@ public: , m_characterClassConstructor(pattern.m_ignoreCase) , m_invertParentheticalAssertion(false) { - m_pattern.m_body = new PatternDisjunction(); - m_alternative = m_pattern.m_body->addNewAlternative(); - m_pattern.m_disjunctions.append(m_pattern.m_body); + OwnPtr body = adoptPtr(new PatternDisjunction); + m_pattern.m_body = body.get(); + m_alternative = body->addNewAlternative(); + m_pattern.m_disjunctions.append(body.release()); } ~YarrPatternConstructor() @@ -255,9 +289,10 @@ public: m_pattern.reset(); m_characterClassConstructor.reset(); - m_pattern.m_body = new PatternDisjunction(); - m_alternative = m_pattern.m_body->addNewAlternative(); - m_pattern.m_disjunctions.append(m_pattern.m_body); + OwnPtr body = adoptPtr(new PatternDisjunction); + m_pattern.m_body = body.get(); + m_alternative = body->addNewAlternative(); + m_pattern.m_disjunctions.append(body.release()); } void assertionBOL() @@ -282,12 +317,21 @@ public: { // We handle case-insensitive checking of unicode characters which do have both // cases by handling them as if they were defined using a CharacterClass. - if (m_pattern.m_ignoreCase && !isASCII(ch) && (Unicode::toUpper(ch) != Unicode::toLower(ch))) { - atomCharacterClassBegin(); - atomCharacterClassAtom(ch); - atomCharacterClassEnd(); - } else + if (!m_pattern.m_ignoreCase || isASCII(ch)) { m_alternative->m_terms.append(PatternTerm(ch)); + return; + } + + const UCS2CanonicalizationRange* info = rangeInfoFor(ch); + if (info->type == CanonicalizeUnique) { + m_alternative->m_terms.append(PatternTerm(ch)); + return; + } + + m_characterClassConstructor.putUnicodeIgnoreCase(ch, info); + OwnPtr newCharacterClass = m_characterClassConstructor.charClass(); + m_alternative->m_terms.append(PatternTerm(newCharacterClass.get(), false)); + m_pattern.m_userCharacterClasses.append(newCharacterClass.release()); } void atomBuiltInCharacterClass(BuiltInCharacterClassID classID, bool invert) @@ -341,15 +385,15 @@ public: break; default: - ASSERT_NOT_REACHED(); + RELEASE_ASSERT_NOT_REACHED(); } } void atomCharacterClassEnd() { - CharacterClass* newCharacterClass = m_characterClassConstructor.charClass(); - m_pattern.m_userCharacterClasses.append(newCharacterClass); - m_alternative->m_terms.append(PatternTerm(newCharacterClass, m_invertCharacterClass)); + OwnPtr newCharacterClass = m_characterClassConstructor.charClass(); + m_alternative->m_terms.append(PatternTerm(newCharacterClass.get(), m_invertCharacterClass)); + m_pattern.m_userCharacterClasses.append(newCharacterClass.release()); } void atomParenthesesSubpatternBegin(bool capture = true) @@ -358,19 +402,19 @@ public: if (capture) m_pattern.m_numSubpatterns++; - PatternDisjunction* parenthesesDisjunction = new PatternDisjunction(m_alternative); - m_pattern.m_disjunctions.append(parenthesesDisjunction); - m_alternative->m_terms.append(PatternTerm(PatternTerm::TypeParenthesesSubpattern, subpatternId, parenthesesDisjunction, capture, false)); + OwnPtr parenthesesDisjunction = adoptPtr(new PatternDisjunction(m_alternative)); + m_alternative->m_terms.append(PatternTerm(PatternTerm::TypeParenthesesSubpattern, subpatternId, parenthesesDisjunction.get(), capture, false)); m_alternative = parenthesesDisjunction->addNewAlternative(); + m_pattern.m_disjunctions.append(parenthesesDisjunction.release()); } void atomParentheticalAssertionBegin(bool invert = false) { - PatternDisjunction* parenthesesDisjunction = new PatternDisjunction(m_alternative); - m_pattern.m_disjunctions.append(parenthesesDisjunction); - m_alternative->m_terms.append(PatternTerm(PatternTerm::TypeParentheticalAssertion, m_pattern.m_numSubpatterns + 1, parenthesesDisjunction, false, invert)); + OwnPtr parenthesesDisjunction = adoptPtr(new PatternDisjunction(m_alternative)); + m_alternative->m_terms.append(PatternTerm(PatternTerm::TypeParentheticalAssertion, m_pattern.m_numSubpatterns + 1, parenthesesDisjunction.get(), false, invert)); m_alternative = parenthesesDisjunction->addNewAlternative(); m_invertParentheticalAssertion = invert; + m_pattern.m_disjunctions.append(parenthesesDisjunction.release()); } void atomParenthesesEnd() @@ -435,23 +479,27 @@ public: // skip alternatives with m_startsWithBOL set true. PatternDisjunction* copyDisjunction(PatternDisjunction* disjunction, bool filterStartsWithBOL = false) { - PatternDisjunction* newDisjunction = 0; + OwnPtr newDisjunction; for (unsigned alt = 0; alt < disjunction->m_alternatives.size(); ++alt) { - PatternAlternative* alternative = disjunction->m_alternatives[alt]; + PatternAlternative* alternative = disjunction->m_alternatives[alt].get(); if (!filterStartsWithBOL || !alternative->m_startsWithBOL) { if (!newDisjunction) { - newDisjunction = new PatternDisjunction(); + newDisjunction = adoptPtr(new PatternDisjunction()); newDisjunction->m_parent = disjunction->m_parent; } PatternAlternative* newAlternative = newDisjunction->addNewAlternative(); + newAlternative->m_terms.reserveInitialCapacity(alternative->m_terms.size()); for (unsigned i = 0; i < alternative->m_terms.size(); ++i) newAlternative->m_terms.append(copyTerm(alternative->m_terms[i], filterStartsWithBOL)); } } - if (newDisjunction) - m_pattern.m_disjunctions.append(newDisjunction); - return newDisjunction; + if (!newDisjunction) + return 0; + + PatternDisjunction* copiedDisjunction = newDisjunction.get(); + m_pattern.m_disjunctions.append(newDisjunction.release()); + return copiedDisjunction; } PatternTerm copyTerm(PatternTerm& term, bool filterStartsWithBOL = false) @@ -478,14 +526,23 @@ public: ASSERT(term.type > PatternTerm::TypeAssertionWordBoundary); ASSERT((term.quantityCount == 1) && (term.quantityType == QuantifierFixedCount)); - // For any assertion with a zero minimum, not matching is valid and has no effect, - // remove it. Otherwise, we need to match as least once, but there is no point - // matching more than once, so remove the quantifier. It is not entirely clear - // from the spec whether or not this behavior is correct, but I believe this - // matches Firefox. :-/ if (term.type == PatternTerm::TypeParentheticalAssertion) { + // If an assertion is quantified with a minimum count of zero, it can simply be removed. + // This arises from the RepeatMatcher behaviour in the spec. Matching an assertion never + // results in any input being consumed, however the continuation passed to the assertion + // (called in steps, 8c and 9 of the RepeatMatcher definition, ES5.1 15.10.2.5) will + // reject all zero length matches (see step 2.1). A match from the continuation of the + // expression will still be accepted regardless (via steps 8a and 11) - the upshot of all + // this is that matches from the assertion are not required, and won't be accepted anyway, + // so no need to ever run it. if (!min) m_alternative->removeLastTerm(); + // We never need to run an assertion more than once. Subsequent interations will be run + // with the same start index (since assertions are non-capturing) and the same captures + // (per step 4 of RepeatMatcher in ES5.1 15.10.2.5), and as such will always produce the + // same result and captures. If the first match succeeds then the subsequent (min - 1) + // matches will too. Any additional optional matches will fail (on the same basis as the + // minimum zero quantified assertions, above), but this will still result in a match. return; } @@ -604,11 +661,13 @@ public: bool hasFixedSize = true; for (unsigned alt = 0; alt < disjunction->m_alternatives.size(); ++alt) { - PatternAlternative* alternative = disjunction->m_alternatives[alt]; + PatternAlternative* alternative = disjunction->m_alternatives[alt].get(); unsigned currentAlternativeCallFrameSize = setupAlternativeOffsets(alternative, initialCallFrameSize, initialInputPosition); - minimumInputSize = min(minimumInputSize, alternative->m_minimumSize); - maximumCallFrameSize = max(maximumCallFrameSize, currentAlternativeCallFrameSize); + minimumInputSize = std::min(minimumInputSize, alternative->m_minimumSize); + maximumCallFrameSize = std::max(maximumCallFrameSize, currentAlternativeCallFrameSize); hasFixedSize &= alternative->m_hasFixedSize; + if (alternative->m_minimumSize > INT_MAX) + m_pattern.m_containsUnsignedLengthPattern = true; } ASSERT(minimumInputSize != UINT_MAX); @@ -639,7 +698,7 @@ public: if (m_pattern.m_numSubpatterns) return; - Vector& alternatives = m_pattern.m_body->m_alternatives; + Vector>& alternatives = m_pattern.m_body->m_alternatives; for (size_t i = 0; i < alternatives.size(); ++i) { Vector& terms = alternatives[i]->m_terms; if (terms.size()) { @@ -674,7 +733,7 @@ public: if (loopDisjunction) { // Move alternatives from loopDisjunction to disjunction for (unsigned alt = 0; alt < loopDisjunction->m_alternatives.size(); ++alt) - disjunction->m_alternatives.append(loopDisjunction->m_alternatives[alt]); + disjunction->m_alternatives.append(loopDisjunction->m_alternatives[alt].release()); loopDisjunction->m_alternatives.clear(); } @@ -693,7 +752,7 @@ public: if (term.type == PatternTerm::TypeParenthesesSubpattern) { PatternDisjunction* nestedDisjunction = term.parentheses.disjunction; for (unsigned alt = 0; alt < nestedDisjunction->m_alternatives.size(); ++alt) { - if (containsCapturingTerms(nestedDisjunction->m_alternatives[alt], 0, nestedDisjunction->m_alternatives[alt]->m_terms.size() - 1)) + if (containsCapturingTerms(nestedDisjunction->m_alternatives[alt].get(), 0, nestedDisjunction->m_alternatives[alt]->m_terms.size() - 1)) return true; } } @@ -709,11 +768,11 @@ public: // beginning and the end of the match. void optimizeDotStarWrappedExpressions() { - Vector& alternatives = m_pattern.m_body->m_alternatives; + Vector>& alternatives = m_pattern.m_body->m_alternatives; if (alternatives.size() != 1) return; - PatternAlternative* alternative = alternatives[0]; + PatternAlternative* alternative = alternatives[0].get(); Vector& terms = alternative->m_terms; if (terms.size() >= 3) { bool startsWithBOL = false; @@ -769,7 +828,7 @@ private: bool m_invertParentheticalAssertion; }; -const char* YarrPattern::compile(const UString& patternString) +const char* YarrPattern::compile(const String& patternString) { YarrPatternConstructor constructor(*this); @@ -802,11 +861,12 @@ const char* YarrPattern::compile(const UString& patternString) return 0; } -YarrPattern::YarrPattern(const UString& pattern, bool ignoreCase, bool multiline, const char** error) +YarrPattern::YarrPattern(const String& pattern, bool ignoreCase, bool multiline, const char** error) : m_ignoreCase(ignoreCase) , m_multiline(multiline) , m_containsBackreferences(false) , m_containsBOL(false) + , m_containsUnsignedLengthPattern(false) , m_numSubpatterns(0) , m_maxBackReference(0) , newlineCached(0)