]>
git.saurik.com Git - apple/javascriptcore.git/blob - wrec/CharacterClassConstructor.cpp
06f42628d945b7dc345786aabf51f472a5783dbd
   2  * Copyright (C) 2008, 2009 Apple Inc. All rights reserved. 
   4  * Redistribution and use in source and binary forms, with or without 
   5  * modification, are permitted provided that the following conditions 
   7  * 1. Redistributions of source code must retain the above copyright 
   8  *    notice, this list of conditions and the following disclaimer. 
   9  * 2. Redistributions in binary form must reproduce the above copyright 
  10  *    notice, this list of conditions and the following disclaimer in the 
  11  *    documentation and/or other materials provided with the distribution. 
  13  * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY 
  14  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 
  15  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 
  16  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE INC. OR 
  17  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 
  18  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 
  19  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 
  20  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 
  21  * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 
  22  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 
  23  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
  27 #include "CharacterClassConstructor.h" 
  31 #include "pcre_internal.h" 
  32 #include <wtf/ASCIICType.h> 
  36 namespace JSC 
{ namespace WREC 
{ 
  38 void CharacterClassConstructor::addSorted(Vector
<UChar
>& matches
, UChar ch
) 
  41     unsigned range 
= matches
.size(); 
  43     // binary chop, find position to insert char. 
  45         unsigned index 
= range 
>> 1; 
  47         int val 
= matches
[pos
+index
] - ch
; 
  58     if (pos 
== matches
.size()) 
  61         matches
.insert(pos
, ch
); 
  64 void CharacterClassConstructor::addSortedRange(Vector
<CharacterRange
>& ranges
, UChar lo
, UChar hi
) 
  66     unsigned end 
= ranges
.size(); 
  68     // Simple linear scan - I doubt there are that many ranges anyway... 
  69     // feel free to fix this with something faster (eg binary chop). 
  70     for (unsigned i 
= 0; i 
< end
; ++i
) { 
  71         // does the new range fall before the current position in the array 
  72         if (hi 
< ranges
[i
].begin
) { 
  73             // optional optimization: concatenate appending ranges? - may not be worthwhile. 
  74             if (hi 
== (ranges
[i
].begin 
- 1)) { 
  78             CharacterRange r 
= {lo
, hi
}; 
  82         // Okay, since we didn't hit the last case, the end of the new range is definitely at or after the begining 
  83         // If the new range start at or before the end of the last range, then the overlap (if it starts one after the 
  84         // end of the last range they concatenate, which is just as good. 
  85         if (lo 
<= (ranges
[i
].end 
+ 1)) { 
  86             // found an intersect! we'll replace this entry in the array. 
  87             ranges
[i
].begin 
= std::min(ranges
[i
].begin
, lo
); 
  88             ranges
[i
].end 
= std::max(ranges
[i
].end
, hi
); 
  90             // now check if the new range can subsume any subsequent ranges. 
  92             // each iteration of the loop we will either remove something from the list, or break the loop. 
  93             while (next 
< ranges
.size()) { 
  94                 if (ranges
[next
].begin 
<= (ranges
[i
].end 
+ 1)) { 
  95                     // the next entry now overlaps / concatenates this one. 
  96                     ranges
[i
].end 
= std::max(ranges
[i
].end
, ranges
[next
].end
); 
 106     // CharacterRange comes after all existing ranges. 
 107     CharacterRange r 
= {lo
, hi
}; 
 111 void CharacterClassConstructor::put(UChar ch
) 
 113     // Parsing a regular expression like [a-z], we start in an initial empty state: 
 114     //     ((m_charBuffer == -1) && !m_isPendingDash) 
 115     // When buffer the 'a' sice it may be (and is in this case) part of a range: 
 116     //     ((m_charBuffer != -1) && !m_isPendingDash) 
 117     // Having parsed the hyphen we then record that the dash is also pending: 
 118     //     ((m_charBuffer != -1) && m_isPendingDash) 
 119     // The next change will always take us back to the initial state - either because 
 120     // a complete range has been parsed (such as [a-z]), or because a flush is forced, 
 121     // due to an early end in the regexp ([a-]), or a character class escape being added 
 122     // ([a-\s]).  The fourth permutation of m_charBuffer and m_isPendingDash is not permitted. 
 123     ASSERT(!((m_charBuffer 
== -1) && m_isPendingDash
)); 
 125     if (m_charBuffer 
!= -1) { 
 126         if (m_isPendingDash
) { 
 127             // EXAMPLE: parsing [-a-c], the 'c' reaches this case - we have buffered a previous character and seen a hyphen, so this is a range. 
 128             UChar lo 
= m_charBuffer
; 
 130             // Reset back to the inital state. 
 132             m_isPendingDash 
= false; 
 134             // This is an error, detected lazily.  Do not proceed. 
 136                 m_isUpsideDown 
= true; 
 142                 char asciiHi 
= std::min(hi
, (UChar
)0x7f); 
 143                 addSortedRange(m_ranges
, lo
, asciiHi
); 
 145                 if (m_isCaseInsensitive
) { 
 146                     if ((asciiLo 
<= 'Z') && (asciiHi 
>= 'A')) 
 147                         addSortedRange(m_ranges
, std::max(asciiLo
, 'A')+('a'-'A'), std::min(asciiHi
, 'Z')+('a'-'A')); 
 148                     if ((asciiLo 
<= 'z') && (asciiHi 
>= 'a')) 
 149                         addSortedRange(m_ranges
, std::max(asciiLo
, 'a')+('A'-'a'), std::min(asciiHi
, 'z')+('A'-'a')); 
 153                 UChar unicodeCurr 
= std::max(lo
, (UChar
)0x80); 
 154                 addSortedRange(m_rangesUnicode
, unicodeCurr
, hi
); 
 156                 if (m_isCaseInsensitive
) { 
 157                     // we're going to scan along, updating the start of the range 
 158                     while (unicodeCurr 
<= hi
) { 
 159                         // Spin forwards over any characters that don't have two cases. 
 160                         for (; jsc_pcre_ucp_othercase(unicodeCurr
) == -1; ++unicodeCurr
) { 
 161                             // if this was the last character in the range, we're done. 
 162                             if (unicodeCurr 
== hi
) 
 165                         // if we fall through to here, unicodeCurr <= hi & has another case. Get the other case. 
 166                         UChar rangeStart 
= unicodeCurr
; 
 167                         UChar otherCurr 
= jsc_pcre_ucp_othercase(unicodeCurr
); 
 169                         // If unicodeCurr is not yet hi, check the next char in the range.  If it also has another case, 
 170                         // and if it's other case value is one greater then the othercase value for the current last 
 171                         // character included in the range, we can include next into the range. 
 172                         while ((unicodeCurr 
< hi
) && (jsc_pcre_ucp_othercase(unicodeCurr 
+ 1) == (otherCurr 
+ 1))) { 
 173                             // increment unicodeCurr; it points to the end of the range. 
 174                             // increment otherCurr, due to the check above other for next must be 1 greater than the currrent other value. 
 179                         // otherChar is the last in the range of other case chars, calculate offset to get back to the start. 
 180                         addSortedRange(m_rangesUnicode
, otherCurr
-(unicodeCurr
-rangeStart
), otherCurr
); 
 182                         // unicodeCurr has been added, move on to the next char. 
 187         } else if (ch 
== '-') 
 188             // EXAMPLE: parsing [-a-c], the second '-' reaches this case - the hyphen is treated as potentially indicating a range. 
 189             m_isPendingDash 
= true; 
 191             // EXAMPLE: Parsing [-a-c], the 'a' reaches this case - we repace the previously buffered char with the 'a'. 
 196         // EXAMPLE: Parsing [-a-c], the first hyphen reaches this case - there is no buffered character 
 197         // (the hyphen not treated as a special character in this case, same handling for any char). 
 201 // When a character is added to the set we do not immediately add it to the arrays, in case it is actually defining a range. 
 202 // When we have determined the character is not used in specifing a range it is added, in a sorted fashion, to the appropriate 
 203 // array (either ascii or unicode). 
 204 // If the pattern is case insensitive we add entries for both cases. 
 205 void CharacterClassConstructor::flush() 
 207     if (m_charBuffer 
!= -1) { 
 208         if (m_charBuffer 
<= 0x7f) { 
 209             if (m_isCaseInsensitive 
&& isASCIILower(m_charBuffer
)) 
 210                 addSorted(m_matches
, toASCIIUpper(m_charBuffer
)); 
 211             addSorted(m_matches
, m_charBuffer
); 
 212             if (m_isCaseInsensitive 
&& isASCIIUpper(m_charBuffer
)) 
 213                 addSorted(m_matches
, toASCIILower(m_charBuffer
)); 
 215             addSorted(m_matchesUnicode
, m_charBuffer
); 
 216             if (m_isCaseInsensitive
) { 
 217                 int other 
= jsc_pcre_ucp_othercase(m_charBuffer
); 
 219                     addSorted(m_matchesUnicode
, other
); 
 225     if (m_isPendingDash
) { 
 226         addSorted(m_matches
, '-'); 
 227         m_isPendingDash 
= false; 
 231 void CharacterClassConstructor::append(const CharacterClass
& other
) 
 233     // [x-\s] will add, 'x', '-', and all unicode spaces to new class (same as [x\s-]). 
 234     // Need to check the spec, really, but think this matches PCRE behaviour. 
 237     if (other
.numMatches
) { 
 238         for (size_t i 
= 0; i 
< other
.numMatches
; ++i
) 
 239             addSorted(m_matches
, other
.matches
[i
]); 
 241     if (other
.numRanges
) { 
 242         for (size_t i 
= 0; i 
< other
.numRanges
; ++i
) 
 243             addSortedRange(m_ranges
, other
.ranges
[i
].begin
, other
.ranges
[i
].end
); 
 245     if (other
.numMatchesUnicode
) { 
 246         for (size_t i 
= 0; i 
< other
.numMatchesUnicode
; ++i
) 
 247             addSorted(m_matchesUnicode
, other
.matchesUnicode
[i
]); 
 249     if (other
.numRangesUnicode
) { 
 250         for (size_t i 
= 0; i 
< other
.numRangesUnicode
; ++i
) 
 251             addSortedRange(m_rangesUnicode
, other
.rangesUnicode
[i
].begin
, other
.rangesUnicode
[i
].end
); 
 255 } } // namespace JSC::WREC 
 257 #endif // ENABLE(WREC)