X-Git-Url: https://git.saurik.com/apple/javascriptcore.git/blobdiff_plain/b5422865f473faf3977f31b96a635c4c8c4ede09..9dae56ea45a0f5f8136a5c93d6f3a7f99399ca73:/wrec/CharacterClassConstructor.cpp diff --git a/wrec/CharacterClassConstructor.cpp b/wrec/CharacterClassConstructor.cpp new file mode 100644 index 0000000..06f4262 --- /dev/null +++ b/wrec/CharacterClassConstructor.cpp @@ -0,0 +1,257 @@ +/* + * Copyright (C) 2008, 2009 Apple Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "config.h" +#include "CharacterClassConstructor.h" + +#if ENABLE(WREC) + +#include "pcre_internal.h" +#include + +using namespace WTF; + +namespace JSC { namespace WREC { + +void CharacterClassConstructor::addSorted(Vector& matches, UChar ch) +{ + unsigned pos = 0; + unsigned range = matches.size(); + + // binary chop, find position to insert char. + while (range) { + unsigned index = range >> 1; + + int val = matches[pos+index] - ch; + if (!val) + return; + else if (val > 0) + range = index; + else { + pos += (index+1); + range -= (index+1); + } + } + + if (pos == matches.size()) + matches.append(ch); + else + matches.insert(pos, ch); +} + +void CharacterClassConstructor::addSortedRange(Vector& ranges, UChar lo, UChar hi) +{ + unsigned end = ranges.size(); + + // Simple linear scan - I doubt there are that many ranges anyway... + // feel free to fix this with something faster (eg binary chop). + for (unsigned i = 0; i < end; ++i) { + // does the new range fall before the current position in the array + if (hi < ranges[i].begin) { + // optional optimization: concatenate appending ranges? - may not be worthwhile. + if (hi == (ranges[i].begin - 1)) { + ranges[i].begin = lo; + return; + } + CharacterRange r = {lo, hi}; + ranges.insert(i, r); + return; + } + // Okay, since we didn't hit the last case, the end of the new range is definitely at or after the begining + // If the new range start at or before the end of the last range, then the overlap (if it starts one after the + // end of the last range they concatenate, which is just as good. + if (lo <= (ranges[i].end + 1)) { + // found an intersect! we'll replace this entry in the array. + ranges[i].begin = std::min(ranges[i].begin, lo); + ranges[i].end = std::max(ranges[i].end, hi); + + // now check if the new range can subsume any subsequent ranges. + unsigned next = i+1; + // each iteration of the loop we will either remove something from the list, or break the loop. + while (next < ranges.size()) { + if (ranges[next].begin <= (ranges[i].end + 1)) { + // the next entry now overlaps / concatenates this one. + ranges[i].end = std::max(ranges[i].end, ranges[next].end); + ranges.remove(next); + } else + break; + } + + return; + } + } + + // CharacterRange comes after all existing ranges. + CharacterRange r = {lo, hi}; + ranges.append(r); +} + +void CharacterClassConstructor::put(UChar ch) +{ + // Parsing a regular expression like [a-z], we start in an initial empty state: + // ((m_charBuffer == -1) && !m_isPendingDash) + // When buffer the 'a' sice it may be (and is in this case) part of a range: + // ((m_charBuffer != -1) && !m_isPendingDash) + // Having parsed the hyphen we then record that the dash is also pending: + // ((m_charBuffer != -1) && m_isPendingDash) + // The next change will always take us back to the initial state - either because + // a complete range has been parsed (such as [a-z]), or because a flush is forced, + // due to an early end in the regexp ([a-]), or a character class escape being added + // ([a-\s]). The fourth permutation of m_charBuffer and m_isPendingDash is not permitted. + ASSERT(!((m_charBuffer == -1) && m_isPendingDash)); + + if (m_charBuffer != -1) { + if (m_isPendingDash) { + // EXAMPLE: parsing [-a-c], the 'c' reaches this case - we have buffered a previous character and seen a hyphen, so this is a range. + UChar lo = m_charBuffer; + UChar hi = ch; + // Reset back to the inital state. + m_charBuffer = -1; + m_isPendingDash = false; + + // This is an error, detected lazily. Do not proceed. + if (lo > hi) { + m_isUpsideDown = true; + return; + } + + if (lo <= 0x7f) { + char asciiLo = lo; + char asciiHi = std::min(hi, (UChar)0x7f); + addSortedRange(m_ranges, lo, asciiHi); + + if (m_isCaseInsensitive) { + if ((asciiLo <= 'Z') && (asciiHi >= 'A')) + addSortedRange(m_ranges, std::max(asciiLo, 'A')+('a'-'A'), std::min(asciiHi, 'Z')+('a'-'A')); + if ((asciiLo <= 'z') && (asciiHi >= 'a')) + addSortedRange(m_ranges, std::max(asciiLo, 'a')+('A'-'a'), std::min(asciiHi, 'z')+('A'-'a')); + } + } + if (hi >= 0x80) { + UChar unicodeCurr = std::max(lo, (UChar)0x80); + addSortedRange(m_rangesUnicode, unicodeCurr, hi); + + if (m_isCaseInsensitive) { + // we're going to scan along, updating the start of the range + while (unicodeCurr <= hi) { + // Spin forwards over any characters that don't have two cases. + for (; jsc_pcre_ucp_othercase(unicodeCurr) == -1; ++unicodeCurr) { + // if this was the last character in the range, we're done. + if (unicodeCurr == hi) + return; + } + // if we fall through to here, unicodeCurr <= hi & has another case. Get the other case. + UChar rangeStart = unicodeCurr; + UChar otherCurr = jsc_pcre_ucp_othercase(unicodeCurr); + + // If unicodeCurr is not yet hi, check the next char in the range. If it also has another case, + // and if it's other case value is one greater then the othercase value for the current last + // character included in the range, we can include next into the range. + while ((unicodeCurr < hi) && (jsc_pcre_ucp_othercase(unicodeCurr + 1) == (otherCurr + 1))) { + // increment unicodeCurr; it points to the end of the range. + // increment otherCurr, due to the check above other for next must be 1 greater than the currrent other value. + ++unicodeCurr; + ++otherCurr; + } + + // otherChar is the last in the range of other case chars, calculate offset to get back to the start. + addSortedRange(m_rangesUnicode, otherCurr-(unicodeCurr-rangeStart), otherCurr); + + // unicodeCurr has been added, move on to the next char. + ++unicodeCurr; + } + } + } + } else if (ch == '-') + // EXAMPLE: parsing [-a-c], the second '-' reaches this case - the hyphen is treated as potentially indicating a range. + m_isPendingDash = true; + else { + // EXAMPLE: Parsing [-a-c], the 'a' reaches this case - we repace the previously buffered char with the 'a'. + flush(); + m_charBuffer = ch; + } + } else + // EXAMPLE: Parsing [-a-c], the first hyphen reaches this case - there is no buffered character + // (the hyphen not treated as a special character in this case, same handling for any char). + m_charBuffer = ch; +} + +// When a character is added to the set we do not immediately add it to the arrays, in case it is actually defining a range. +// When we have determined the character is not used in specifing a range it is added, in a sorted fashion, to the appropriate +// array (either ascii or unicode). +// If the pattern is case insensitive we add entries for both cases. +void CharacterClassConstructor::flush() +{ + if (m_charBuffer != -1) { + if (m_charBuffer <= 0x7f) { + if (m_isCaseInsensitive && isASCIILower(m_charBuffer)) + addSorted(m_matches, toASCIIUpper(m_charBuffer)); + addSorted(m_matches, m_charBuffer); + if (m_isCaseInsensitive && isASCIIUpper(m_charBuffer)) + addSorted(m_matches, toASCIILower(m_charBuffer)); + } else { + addSorted(m_matchesUnicode, m_charBuffer); + if (m_isCaseInsensitive) { + int other = jsc_pcre_ucp_othercase(m_charBuffer); + if (other != -1) + addSorted(m_matchesUnicode, other); + } + } + m_charBuffer = -1; + } + + if (m_isPendingDash) { + addSorted(m_matches, '-'); + m_isPendingDash = false; + } +} + +void CharacterClassConstructor::append(const CharacterClass& other) +{ + // [x-\s] will add, 'x', '-', and all unicode spaces to new class (same as [x\s-]). + // Need to check the spec, really, but think this matches PCRE behaviour. + flush(); + + if (other.numMatches) { + for (size_t i = 0; i < other.numMatches; ++i) + addSorted(m_matches, other.matches[i]); + } + if (other.numRanges) { + for (size_t i = 0; i < other.numRanges; ++i) + addSortedRange(m_ranges, other.ranges[i].begin, other.ranges[i].end); + } + if (other.numMatchesUnicode) { + for (size_t i = 0; i < other.numMatchesUnicode; ++i) + addSorted(m_matchesUnicode, other.matchesUnicode[i]); + } + if (other.numRangesUnicode) { + for (size_t i = 0; i < other.numRangesUnicode; ++i) + addSortedRange(m_rangesUnicode, other.rangesUnicode[i].begin, other.rangesUnicode[i].end); + } +} + +} } // namespace JSC::WREC + +#endif // ENABLE(WREC)