wrec/CharacterClassConstructor.cpp

   1 /*
   2  * Copyright (C) 2008, 2009 Apple Inc. All rights reserved.
   3  *
   4  * Redistribution and use in source and binary forms, with or without
   5  * modification, are permitted provided that the following conditions
   6  * are met:
   7  * 1. Redistributions of source code must retain the above copyright
   8  *    notice, this list of conditions and the following disclaimer.
   9  * 2. Redistributions in binary form must reproduce the above copyright
  10  *    notice, this list of conditions and the following disclaimer in the
  11  *    documentation and/or other materials provided with the distribution.
  12  *
  13  * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
  14  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  15  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
  16  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE INC. OR
  17  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
  18  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
  19  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
  20  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
  21  * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  22  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  23  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  24  */
  25
  26 #include "config.h"
  27 #include "CharacterClassConstructor.h"
  28
  29 #if ENABLE(WREC)
  30
  31 #include "pcre_internal.h"
  32 #include <wtf/ASCIICType.h>
  33
  34 using namespace WTF;
  35
  36 namespace JSC { namespace WREC {
  37
  38 void CharacterClassConstructor::addSorted(Vector<UChar>& matches, UChar ch)
  39 {
  40     unsigned pos = 0;
  41     unsigned range = matches.size();
  42
  43     // binary chop, find position to insert char.
  44     while (range) {
  45         unsigned index = range >> 1;
  46
  47         int val = matches[pos+index] - ch;
  48         if (!val)
  49             return;
  50         else if (val > 0)
  51             range = index;
  52         else {
  53             pos += (index+1);
  54             range -= (index+1);
  55         }
  56     }
  57
  58     if (pos == matches.size())
  59         matches.append(ch);
  60     else
  61         matches.insert(pos, ch);
  62 }
  63
  64 void CharacterClassConstructor::addSortedRange(Vector<CharacterRange>& ranges, UChar lo, UChar hi)
  65 {
  66     unsigned end = ranges.size();
  67
  68     // Simple linear scan - I doubt there are that many ranges anyway...
  69     // feel free to fix this with something faster (eg binary chop).
  70     for (unsigned i = 0; i < end; ++i) {
  71         // does the new range fall before the current position in the array
  72         if (hi < ranges[i].begin) {
  73             // optional optimization: concatenate appending ranges? - may not be worthwhile.
  74             if (hi == (ranges[i].begin - 1)) {
  75                 ranges[i].begin = lo;
  76                 return;
  77             }
  78             CharacterRange r = {lo, hi};
  79             ranges.insert(i, r);
  80             return;
  81         }
  82         // Okay, since we didn't hit the last case, the end of the new range is definitely at or after the begining
  83         // If the new range start at or before the end of the last range, then the overlap (if it starts one after the
  84         // end of the last range they concatenate, which is just as good.
  85         if (lo <= (ranges[i].end + 1)) {
  86             // found an intersect! we'll replace this entry in the array.
  87             ranges[i].begin = std::min(ranges[i].begin, lo);
  88             ranges[i].end = std::max(ranges[i].end, hi);
  89
  90             // now check if the new range can subsume any subsequent ranges.
  91             unsigned next = i+1;
  92             // each iteration of the loop we will either remove something from the list, or break the loop.
  93             while (next < ranges.size()) {
  94                 if (ranges[next].begin <= (ranges[i].end + 1)) {
  95                     // the next entry now overlaps / concatenates this one.
  96                     ranges[i].end = std::max(ranges[i].end, ranges[next].end);
  97                     ranges.remove(next);
  98                 } else
  99                     break;
 100             }
 101
 102             return;
 103         }
 104     }
 105
 106     // CharacterRange comes after all existing ranges.
 107     CharacterRange r = {lo, hi};
 108     ranges.append(r);
 109 }
 110
 111 void CharacterClassConstructor::put(UChar ch)
 112 {
 113     // Parsing a regular expression like [a-z], we start in an initial empty state:
 114     //     ((m_charBuffer == -1) && !m_isPendingDash)
 115     // When buffer the 'a' sice it may be (and is in this case) part of a range:
 116     //     ((m_charBuffer != -1) && !m_isPendingDash)
 117     // Having parsed the hyphen we then record that the dash is also pending:
 118     //     ((m_charBuffer != -1) && m_isPendingDash)
 119     // The next change will always take us back to the initial state - either because
 120     // a complete range has been parsed (such as [a-z]), or because a flush is forced,
 121     // due to an early end in the regexp ([a-]), or a character class escape being added
 122     // ([a-\s]).  The fourth permutation of m_charBuffer and m_isPendingDash is not permitted.
 123     ASSERT(!((m_charBuffer == -1) && m_isPendingDash));
 124
 125     if (m_charBuffer != -1) {
 126         if (m_isPendingDash) {
 127             // EXAMPLE: parsing [-a-c], the 'c' reaches this case - we have buffered a previous character and seen a hyphen, so this is a range.
 128             UChar lo = m_charBuffer;
 129             UChar hi = ch;
 130             // Reset back to the inital state.
 131             m_charBuffer = -1;
 132             m_isPendingDash = false;
 133
 134             // This is an error, detected lazily.  Do not proceed.
 135             if (lo > hi) {
 136                 m_isUpsideDown = true;
 137                 return;
 138             }
 139
 140             if (lo <= 0x7f) {
 141                 char asciiLo = lo;
 142                 char asciiHi = std::min(hi, (UChar)0x7f);
 143                 addSortedRange(m_ranges, lo, asciiHi);
 144
 145                 if (m_isCaseInsensitive) {
 146                     if ((asciiLo <= 'Z') && (asciiHi >= 'A'))
 147                         addSortedRange(m_ranges, std::max(asciiLo, 'A')+('a'-'A'), std::min(asciiHi, 'Z')+('a'-'A'));
 148                     if ((asciiLo <= 'z') && (asciiHi >= 'a'))
 149                         addSortedRange(m_ranges, std::max(asciiLo, 'a')+('A'-'a'), std::min(asciiHi, 'z')+('A'-'a'));
 150                 }
 151             }
 152             if (hi >= 0x80) {
 153                 UChar unicodeCurr = std::max(lo, (UChar)0x80);
 154                 addSortedRange(m_rangesUnicode, unicodeCurr, hi);
 155
 156                 if (m_isCaseInsensitive) {
 157                     // we're going to scan along, updating the start of the range
 158                     while (unicodeCurr <= hi) {
 159                         // Spin forwards over any characters that don't have two cases.
 160                         for (; jsc_pcre_ucp_othercase(unicodeCurr) == -1; ++unicodeCurr) {
 161                             // if this was the last character in the range, we're done.
 162                             if (unicodeCurr == hi)
 163                                 return;
 164                         }
 165                         // if we fall through to here, unicodeCurr <= hi & has another case. Get the other case.
 166                         UChar rangeStart = unicodeCurr;
 167                         UChar otherCurr = jsc_pcre_ucp_othercase(unicodeCurr);
 168
 169                         // If unicodeCurr is not yet hi, check the next char in the range.  If it also has another case,
 170                         // and if it's other case value is one greater then the othercase value for the current last
 171                         // character included in the range, we can include next into the range.
 172                         while ((unicodeCurr < hi) && (jsc_pcre_ucp_othercase(unicodeCurr + 1) == (otherCurr + 1))) {
 173                             // increment unicodeCurr; it points to the end of the range.
 174                             // increment otherCurr, due to the check above other for next must be 1 greater than the currrent other value.
 175                             ++unicodeCurr;
 176                             ++otherCurr;
 177                         }
 178
 179                         // otherChar is the last in the range of other case chars, calculate offset to get back to the start.
 180                         addSortedRange(m_rangesUnicode, otherCurr-(unicodeCurr-rangeStart), otherCurr);
 181
 182                         // unicodeCurr has been added, move on to the next char.
 183                         ++unicodeCurr;
 184                     }
 185                 }
 186             }
 187         } else if (ch == '-')
 188             // EXAMPLE: parsing [-a-c], the second '-' reaches this case - the hyphen is treated as potentially indicating a range.
 189             m_isPendingDash = true;
 190         else {
 191             // EXAMPLE: Parsing [-a-c], the 'a' reaches this case - we repace the previously buffered char with the 'a'.
 192             flush();
 193             m_charBuffer = ch;
 194         }
 195     } else
 196         // EXAMPLE: Parsing [-a-c], the first hyphen reaches this case - there is no buffered character
 197         // (the hyphen not treated as a special character in this case, same handling for any char).
 198         m_charBuffer = ch;
 199 }
 200
 201 // When a character is added to the set we do not immediately add it to the arrays, in case it is actually defining a range.
 202 // When we have determined the character is not used in specifing a range it is added, in a sorted fashion, to the appropriate
 203 // array (either ascii or unicode).
 204 // If the pattern is case insensitive we add entries for both cases.
 205 void CharacterClassConstructor::flush()
 206 {
 207     if (m_charBuffer != -1) {
 208         if (m_charBuffer <= 0x7f) {
 209             if (m_isCaseInsensitive && isASCIILower(m_charBuffer))
 210                 addSorted(m_matches, toASCIIUpper(m_charBuffer));
 211             addSorted(m_matches, m_charBuffer);
 212             if (m_isCaseInsensitive && isASCIIUpper(m_charBuffer))
 213                 addSorted(m_matches, toASCIILower(m_charBuffer));
 214         } else {
 215             addSorted(m_matchesUnicode, m_charBuffer);
 216             if (m_isCaseInsensitive) {
 217                 int other = jsc_pcre_ucp_othercase(m_charBuffer);
 218                 if (other != -1)
 219                     addSorted(m_matchesUnicode, other);
 220             }
 221         }
 222         m_charBuffer = -1;
 223     }
 224
 225     if (m_isPendingDash) {
 226         addSorted(m_matches, '-');
 227         m_isPendingDash = false;
 228     }
 229 }
 230
 231 void CharacterClassConstructor::append(const CharacterClass& other)
 232 {
 233     // [x-\s] will add, 'x', '-', and all unicode spaces to new class (same as [x\s-]).
 234     // Need to check the spec, really, but think this matches PCRE behaviour.
 235     flush();
 236
 237     if (other.numMatches) {
 238         for (size_t i = 0; i < other.numMatches; ++i)
 239             addSorted(m_matches, other.matches[i]);
 240     }
 241     if (other.numRanges) {
 242         for (size_t i = 0; i < other.numRanges; ++i)
 243             addSortedRange(m_ranges, other.ranges[i].begin, other.ranges[i].end);
 244     }
 245     if (other.numMatchesUnicode) {
 246         for (size_t i = 0; i < other.numMatchesUnicode; ++i)
 247             addSorted(m_matchesUnicode, other.matchesUnicode[i]);
 248     }
 249     if (other.numRangesUnicode) {
 250         for (size_t i = 0; i < other.numRangesUnicode; ++i)
 251             addSortedRange(m_rangesUnicode, other.rangesUnicode[i].begin, other.rangesUnicode[i].end);
 252     }
 253 }
 254
 255 } } // namespace JSC::WREC
 256
 257 #endif // ENABLE(WREC)