]>
Commit | Line | Data |
---|---|---|
9dae56ea A |
1 | /* |
2 | * Copyright (C) 2008, 2009 Apple Inc. All rights reserved. | |
3 | * | |
4 | * Redistribution and use in source and binary forms, with or without | |
5 | * modification, are permitted provided that the following conditions | |
6 | * are met: | |
7 | * 1. Redistributions of source code must retain the above copyright | |
8 | * notice, this list of conditions and the following disclaimer. | |
9 | * 2. Redistributions in binary form must reproduce the above copyright | |
10 | * notice, this list of conditions and the following disclaimer in the | |
11 | * documentation and/or other materials provided with the distribution. | |
12 | * | |
13 | * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY | |
14 | * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |
15 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR | |
16 | * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR | |
17 | * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, | |
18 | * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, | |
19 | * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR | |
20 | * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY | |
21 | * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT | |
22 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE | |
23 | * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |
24 | */ | |
25 | ||
26 | #include "config.h" | |
27 | #include "CharacterClassConstructor.h" | |
28 | ||
29 | #if ENABLE(WREC) | |
30 | ||
31 | #include "pcre_internal.h" | |
32 | #include <wtf/ASCIICType.h> | |
33 | ||
34 | using namespace WTF; | |
35 | ||
36 | namespace JSC { namespace WREC { | |
37 | ||
38 | void CharacterClassConstructor::addSorted(Vector<UChar>& matches, UChar ch) | |
39 | { | |
40 | unsigned pos = 0; | |
41 | unsigned range = matches.size(); | |
42 | ||
43 | // binary chop, find position to insert char. | |
44 | while (range) { | |
45 | unsigned index = range >> 1; | |
46 | ||
47 | int val = matches[pos+index] - ch; | |
48 | if (!val) | |
49 | return; | |
50 | else if (val > 0) | |
51 | range = index; | |
52 | else { | |
53 | pos += (index+1); | |
54 | range -= (index+1); | |
55 | } | |
56 | } | |
57 | ||
58 | if (pos == matches.size()) | |
59 | matches.append(ch); | |
60 | else | |
61 | matches.insert(pos, ch); | |
62 | } | |
63 | ||
64 | void CharacterClassConstructor::addSortedRange(Vector<CharacterRange>& ranges, UChar lo, UChar hi) | |
65 | { | |
66 | unsigned end = ranges.size(); | |
67 | ||
68 | // Simple linear scan - I doubt there are that many ranges anyway... | |
69 | // feel free to fix this with something faster (eg binary chop). | |
70 | for (unsigned i = 0; i < end; ++i) { | |
71 | // does the new range fall before the current position in the array | |
72 | if (hi < ranges[i].begin) { | |
73 | // optional optimization: concatenate appending ranges? - may not be worthwhile. | |
74 | if (hi == (ranges[i].begin - 1)) { | |
75 | ranges[i].begin = lo; | |
76 | return; | |
77 | } | |
78 | CharacterRange r = {lo, hi}; | |
79 | ranges.insert(i, r); | |
80 | return; | |
81 | } | |
82 | // Okay, since we didn't hit the last case, the end of the new range is definitely at or after the begining | |
83 | // If the new range start at or before the end of the last range, then the overlap (if it starts one after the | |
84 | // end of the last range they concatenate, which is just as good. | |
85 | if (lo <= (ranges[i].end + 1)) { | |
86 | // found an intersect! we'll replace this entry in the array. | |
87 | ranges[i].begin = std::min(ranges[i].begin, lo); | |
88 | ranges[i].end = std::max(ranges[i].end, hi); | |
89 | ||
90 | // now check if the new range can subsume any subsequent ranges. | |
91 | unsigned next = i+1; | |
92 | // each iteration of the loop we will either remove something from the list, or break the loop. | |
93 | while (next < ranges.size()) { | |
94 | if (ranges[next].begin <= (ranges[i].end + 1)) { | |
95 | // the next entry now overlaps / concatenates this one. | |
96 | ranges[i].end = std::max(ranges[i].end, ranges[next].end); | |
97 | ranges.remove(next); | |
98 | } else | |
99 | break; | |
100 | } | |
101 | ||
102 | return; | |
103 | } | |
104 | } | |
105 | ||
106 | // CharacterRange comes after all existing ranges. | |
107 | CharacterRange r = {lo, hi}; | |
108 | ranges.append(r); | |
109 | } | |
110 | ||
111 | void CharacterClassConstructor::put(UChar ch) | |
112 | { | |
113 | // Parsing a regular expression like [a-z], we start in an initial empty state: | |
114 | // ((m_charBuffer == -1) && !m_isPendingDash) | |
115 | // When buffer the 'a' sice it may be (and is in this case) part of a range: | |
116 | // ((m_charBuffer != -1) && !m_isPendingDash) | |
117 | // Having parsed the hyphen we then record that the dash is also pending: | |
118 | // ((m_charBuffer != -1) && m_isPendingDash) | |
119 | // The next change will always take us back to the initial state - either because | |
120 | // a complete range has been parsed (such as [a-z]), or because a flush is forced, | |
121 | // due to an early end in the regexp ([a-]), or a character class escape being added | |
122 | // ([a-\s]). The fourth permutation of m_charBuffer and m_isPendingDash is not permitted. | |
123 | ASSERT(!((m_charBuffer == -1) && m_isPendingDash)); | |
124 | ||
125 | if (m_charBuffer != -1) { | |
126 | if (m_isPendingDash) { | |
127 | // EXAMPLE: parsing [-a-c], the 'c' reaches this case - we have buffered a previous character and seen a hyphen, so this is a range. | |
128 | UChar lo = m_charBuffer; | |
129 | UChar hi = ch; | |
130 | // Reset back to the inital state. | |
131 | m_charBuffer = -1; | |
132 | m_isPendingDash = false; | |
133 | ||
134 | // This is an error, detected lazily. Do not proceed. | |
135 | if (lo > hi) { | |
136 | m_isUpsideDown = true; | |
137 | return; | |
138 | } | |
139 | ||
140 | if (lo <= 0x7f) { | |
141 | char asciiLo = lo; | |
142 | char asciiHi = std::min(hi, (UChar)0x7f); | |
143 | addSortedRange(m_ranges, lo, asciiHi); | |
144 | ||
145 | if (m_isCaseInsensitive) { | |
146 | if ((asciiLo <= 'Z') && (asciiHi >= 'A')) | |
147 | addSortedRange(m_ranges, std::max(asciiLo, 'A')+('a'-'A'), std::min(asciiHi, 'Z')+('a'-'A')); | |
148 | if ((asciiLo <= 'z') && (asciiHi >= 'a')) | |
149 | addSortedRange(m_ranges, std::max(asciiLo, 'a')+('A'-'a'), std::min(asciiHi, 'z')+('A'-'a')); | |
150 | } | |
151 | } | |
152 | if (hi >= 0x80) { | |
153 | UChar unicodeCurr = std::max(lo, (UChar)0x80); | |
154 | addSortedRange(m_rangesUnicode, unicodeCurr, hi); | |
155 | ||
156 | if (m_isCaseInsensitive) { | |
157 | // we're going to scan along, updating the start of the range | |
158 | while (unicodeCurr <= hi) { | |
159 | // Spin forwards over any characters that don't have two cases. | |
160 | for (; jsc_pcre_ucp_othercase(unicodeCurr) == -1; ++unicodeCurr) { | |
161 | // if this was the last character in the range, we're done. | |
162 | if (unicodeCurr == hi) | |
163 | return; | |
164 | } | |
165 | // if we fall through to here, unicodeCurr <= hi & has another case. Get the other case. | |
166 | UChar rangeStart = unicodeCurr; | |
167 | UChar otherCurr = jsc_pcre_ucp_othercase(unicodeCurr); | |
168 | ||
169 | // If unicodeCurr is not yet hi, check the next char in the range. If it also has another case, | |
170 | // and if it's other case value is one greater then the othercase value for the current last | |
171 | // character included in the range, we can include next into the range. | |
172 | while ((unicodeCurr < hi) && (jsc_pcre_ucp_othercase(unicodeCurr + 1) == (otherCurr + 1))) { | |
173 | // increment unicodeCurr; it points to the end of the range. | |
174 | // increment otherCurr, due to the check above other for next must be 1 greater than the currrent other value. | |
175 | ++unicodeCurr; | |
176 | ++otherCurr; | |
177 | } | |
178 | ||
179 | // otherChar is the last in the range of other case chars, calculate offset to get back to the start. | |
180 | addSortedRange(m_rangesUnicode, otherCurr-(unicodeCurr-rangeStart), otherCurr); | |
181 | ||
182 | // unicodeCurr has been added, move on to the next char. | |
183 | ++unicodeCurr; | |
184 | } | |
185 | } | |
186 | } | |
187 | } else if (ch == '-') | |
188 | // EXAMPLE: parsing [-a-c], the second '-' reaches this case - the hyphen is treated as potentially indicating a range. | |
189 | m_isPendingDash = true; | |
190 | else { | |
191 | // EXAMPLE: Parsing [-a-c], the 'a' reaches this case - we repace the previously buffered char with the 'a'. | |
192 | flush(); | |
193 | m_charBuffer = ch; | |
194 | } | |
195 | } else | |
196 | // EXAMPLE: Parsing [-a-c], the first hyphen reaches this case - there is no buffered character | |
197 | // (the hyphen not treated as a special character in this case, same handling for any char). | |
198 | m_charBuffer = ch; | |
199 | } | |
200 | ||
201 | // When a character is added to the set we do not immediately add it to the arrays, in case it is actually defining a range. | |
202 | // When we have determined the character is not used in specifing a range it is added, in a sorted fashion, to the appropriate | |
203 | // array (either ascii or unicode). | |
204 | // If the pattern is case insensitive we add entries for both cases. | |
205 | void CharacterClassConstructor::flush() | |
206 | { | |
207 | if (m_charBuffer != -1) { | |
208 | if (m_charBuffer <= 0x7f) { | |
209 | if (m_isCaseInsensitive && isASCIILower(m_charBuffer)) | |
210 | addSorted(m_matches, toASCIIUpper(m_charBuffer)); | |
211 | addSorted(m_matches, m_charBuffer); | |
212 | if (m_isCaseInsensitive && isASCIIUpper(m_charBuffer)) | |
213 | addSorted(m_matches, toASCIILower(m_charBuffer)); | |
214 | } else { | |
215 | addSorted(m_matchesUnicode, m_charBuffer); | |
216 | if (m_isCaseInsensitive) { | |
217 | int other = jsc_pcre_ucp_othercase(m_charBuffer); | |
218 | if (other != -1) | |
219 | addSorted(m_matchesUnicode, other); | |
220 | } | |
221 | } | |
222 | m_charBuffer = -1; | |
223 | } | |
224 | ||
225 | if (m_isPendingDash) { | |
226 | addSorted(m_matches, '-'); | |
227 | m_isPendingDash = false; | |
228 | } | |
229 | } | |
230 | ||
231 | void CharacterClassConstructor::append(const CharacterClass& other) | |
232 | { | |
233 | // [x-\s] will add, 'x', '-', and all unicode spaces to new class (same as [x\s-]). | |
234 | // Need to check the spec, really, but think this matches PCRE behaviour. | |
235 | flush(); | |
236 | ||
237 | if (other.numMatches) { | |
238 | for (size_t i = 0; i < other.numMatches; ++i) | |
239 | addSorted(m_matches, other.matches[i]); | |
240 | } | |
241 | if (other.numRanges) { | |
242 | for (size_t i = 0; i < other.numRanges; ++i) | |
243 | addSortedRange(m_ranges, other.ranges[i].begin, other.ranges[i].end); | |
244 | } | |
245 | if (other.numMatchesUnicode) { | |
246 | for (size_t i = 0; i < other.numMatchesUnicode; ++i) | |
247 | addSorted(m_matchesUnicode, other.matchesUnicode[i]); | |
248 | } | |
249 | if (other.numRangesUnicode) { | |
250 | for (size_t i = 0; i < other.numRangesUnicode; ++i) | |
251 | addSortedRange(m_rangesUnicode, other.rangesUnicode[i].begin, other.rangesUnicode[i].end); | |
252 | } | |
253 | } | |
254 | ||
255 | } } // namespace JSC::WREC | |
256 | ||
257 | #endif // ENABLE(WREC) |