]>
git.saurik.com Git - apple/javascriptcore.git/blob - wrec/CharacterClassConstructor.cpp
2 * Copyright (C) 2008, 2009 Apple Inc. All rights reserved.
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions
7 * 1. Redistributions of source code must retain the above copyright
8 * notice, this list of conditions and the following disclaimer.
9 * 2. Redistributions in binary form must reproduce the above copyright
10 * notice, this list of conditions and the following disclaimer in the
11 * documentation and/or other materials provided with the distribution.
13 * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
14 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
15 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
16 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
17 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
18 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
19 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
20 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
21 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
22 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
23 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 #include "CharacterClassConstructor.h"
31 #include "pcre_internal.h"
32 #include <wtf/ASCIICType.h>
36 namespace JSC
{ namespace WREC
{
38 void CharacterClassConstructor::addSorted(Vector
<UChar
>& matches
, UChar ch
)
41 unsigned range
= matches
.size();
43 // binary chop, find position to insert char.
45 unsigned index
= range
>> 1;
47 int val
= matches
[pos
+index
] - ch
;
58 if (pos
== matches
.size())
61 matches
.insert(pos
, ch
);
64 void CharacterClassConstructor::addSortedRange(Vector
<CharacterRange
>& ranges
, UChar lo
, UChar hi
)
66 unsigned end
= ranges
.size();
68 // Simple linear scan - I doubt there are that many ranges anyway...
69 // feel free to fix this with something faster (eg binary chop).
70 for (unsigned i
= 0; i
< end
; ++i
) {
71 // does the new range fall before the current position in the array
72 if (hi
< ranges
[i
].begin
) {
73 // optional optimization: concatenate appending ranges? - may not be worthwhile.
74 if (hi
== (ranges
[i
].begin
- 1)) {
78 CharacterRange r
= {lo
, hi
};
82 // Okay, since we didn't hit the last case, the end of the new range is definitely at or after the begining
83 // If the new range start at or before the end of the last range, then the overlap (if it starts one after the
84 // end of the last range they concatenate, which is just as good.
85 if (lo
<= (ranges
[i
].end
+ 1)) {
86 // found an intersect! we'll replace this entry in the array.
87 ranges
[i
].begin
= std::min(ranges
[i
].begin
, lo
);
88 ranges
[i
].end
= std::max(ranges
[i
].end
, hi
);
90 // now check if the new range can subsume any subsequent ranges.
92 // each iteration of the loop we will either remove something from the list, or break the loop.
93 while (next
< ranges
.size()) {
94 if (ranges
[next
].begin
<= (ranges
[i
].end
+ 1)) {
95 // the next entry now overlaps / concatenates this one.
96 ranges
[i
].end
= std::max(ranges
[i
].end
, ranges
[next
].end
);
106 // CharacterRange comes after all existing ranges.
107 CharacterRange r
= {lo
, hi
};
111 void CharacterClassConstructor::put(UChar ch
)
113 // Parsing a regular expression like [a-z], we start in an initial empty state:
114 // ((m_charBuffer == -1) && !m_isPendingDash)
115 // When buffer the 'a' sice it may be (and is in this case) part of a range:
116 // ((m_charBuffer != -1) && !m_isPendingDash)
117 // Having parsed the hyphen we then record that the dash is also pending:
118 // ((m_charBuffer != -1) && m_isPendingDash)
119 // The next change will always take us back to the initial state - either because
120 // a complete range has been parsed (such as [a-z]), or because a flush is forced,
121 // due to an early end in the regexp ([a-]), or a character class escape being added
122 // ([a-\s]). The fourth permutation of m_charBuffer and m_isPendingDash is not permitted.
123 ASSERT(!((m_charBuffer
== -1) && m_isPendingDash
));
125 if (m_charBuffer
!= -1) {
126 if (m_isPendingDash
) {
127 // EXAMPLE: parsing [-a-c], the 'c' reaches this case - we have buffered a previous character and seen a hyphen, so this is a range.
128 UChar lo
= m_charBuffer
;
130 // Reset back to the inital state.
132 m_isPendingDash
= false;
134 // This is an error, detected lazily. Do not proceed.
136 m_isUpsideDown
= true;
142 char asciiHi
= std::min(hi
, (UChar
)0x7f);
143 addSortedRange(m_ranges
, lo
, asciiHi
);
145 if (m_isCaseInsensitive
) {
146 if ((asciiLo
<= 'Z') && (asciiHi
>= 'A'))
147 addSortedRange(m_ranges
, std::max(asciiLo
, 'A')+('a'-'A'), std::min(asciiHi
, 'Z')+('a'-'A'));
148 if ((asciiLo
<= 'z') && (asciiHi
>= 'a'))
149 addSortedRange(m_ranges
, std::max(asciiLo
, 'a')+('A'-'a'), std::min(asciiHi
, 'z')+('A'-'a'));
153 UChar unicodeCurr
= std::max(lo
, (UChar
)0x80);
154 addSortedRange(m_rangesUnicode
, unicodeCurr
, hi
);
156 if (m_isCaseInsensitive
) {
157 // we're going to scan along, updating the start of the range
158 while (unicodeCurr
<= hi
) {
159 // Spin forwards over any characters that don't have two cases.
160 for (; jsc_pcre_ucp_othercase(unicodeCurr
) == -1; ++unicodeCurr
) {
161 // if this was the last character in the range, we're done.
162 if (unicodeCurr
== hi
)
165 // if we fall through to here, unicodeCurr <= hi & has another case. Get the other case.
166 UChar rangeStart
= unicodeCurr
;
167 UChar otherCurr
= jsc_pcre_ucp_othercase(unicodeCurr
);
169 // If unicodeCurr is not yet hi, check the next char in the range. If it also has another case,
170 // and if it's other case value is one greater then the othercase value for the current last
171 // character included in the range, we can include next into the range.
172 while ((unicodeCurr
< hi
) && (jsc_pcre_ucp_othercase(unicodeCurr
+ 1) == (otherCurr
+ 1))) {
173 // increment unicodeCurr; it points to the end of the range.
174 // increment otherCurr, due to the check above other for next must be 1 greater than the currrent other value.
179 // otherChar is the last in the range of other case chars, calculate offset to get back to the start.
180 addSortedRange(m_rangesUnicode
, otherCurr
-(unicodeCurr
-rangeStart
), otherCurr
);
182 // unicodeCurr has been added, move on to the next char.
187 } else if (ch
== '-')
188 // EXAMPLE: parsing [-a-c], the second '-' reaches this case - the hyphen is treated as potentially indicating a range.
189 m_isPendingDash
= true;
191 // EXAMPLE: Parsing [-a-c], the 'a' reaches this case - we repace the previously buffered char with the 'a'.
196 // EXAMPLE: Parsing [-a-c], the first hyphen reaches this case - there is no buffered character
197 // (the hyphen not treated as a special character in this case, same handling for any char).
201 // When a character is added to the set we do not immediately add it to the arrays, in case it is actually defining a range.
202 // When we have determined the character is not used in specifing a range it is added, in a sorted fashion, to the appropriate
203 // array (either ascii or unicode).
204 // If the pattern is case insensitive we add entries for both cases.
205 void CharacterClassConstructor::flush()
207 if (m_charBuffer
!= -1) {
208 if (m_charBuffer
<= 0x7f) {
209 if (m_isCaseInsensitive
&& isASCIILower(m_charBuffer
))
210 addSorted(m_matches
, toASCIIUpper(m_charBuffer
));
211 addSorted(m_matches
, m_charBuffer
);
212 if (m_isCaseInsensitive
&& isASCIIUpper(m_charBuffer
))
213 addSorted(m_matches
, toASCIILower(m_charBuffer
));
215 addSorted(m_matchesUnicode
, m_charBuffer
);
216 if (m_isCaseInsensitive
) {
217 int other
= jsc_pcre_ucp_othercase(m_charBuffer
);
219 addSorted(m_matchesUnicode
, other
);
225 if (m_isPendingDash
) {
226 addSorted(m_matches
, '-');
227 m_isPendingDash
= false;
231 void CharacterClassConstructor::append(const CharacterClass
& other
)
233 // [x-\s] will add, 'x', '-', and all unicode spaces to new class (same as [x\s-]).
234 // Need to check the spec, really, but think this matches PCRE behaviour.
237 if (other
.numMatches
) {
238 for (size_t i
= 0; i
< other
.numMatches
; ++i
)
239 addSorted(m_matches
, other
.matches
[i
]);
241 if (other
.numRanges
) {
242 for (size_t i
= 0; i
< other
.numRanges
; ++i
)
243 addSortedRange(m_ranges
, other
.ranges
[i
].begin
, other
.ranges
[i
].end
);
245 if (other
.numMatchesUnicode
) {
246 for (size_t i
= 0; i
< other
.numMatchesUnicode
; ++i
)
247 addSorted(m_matchesUnicode
, other
.matchesUnicode
[i
]);
249 if (other
.numRangesUnicode
) {
250 for (size_t i
= 0; i
< other
.numRangesUnicode
; ++i
)
251 addSortedRange(m_rangesUnicode
, other
.rangesUnicode
[i
].begin
, other
.rangesUnicode
[i
].end
);
255 } } // namespace JSC::WREC
257 #endif // ENABLE(WREC)