[apple/javascriptcore.git] / wrec / CharacterClassConstructor.cpp

/*
 * Copyright (C) 2008, 2009 Apple Inc. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE INC. OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
 */

#include "config.h"
#include "CharacterClassConstructor.h"

#if ENABLE(WREC)

#include "pcre_internal.h"
#include <wtf/ASCIICType.h>

using namespace WTF;

namespace JSC { namespace WREC {

// Adds 'ch' to 'matches', which is kept in ascending order without duplicates.
// If 'ch' is already present the vector is left untouched.
void CharacterClassConstructor::addSorted(Vector<UChar>& matches, UChar ch)
{
    // Binary search over the window [insertAt, insertAt + remaining).
    unsigned insertAt = 0;
    unsigned remaining = matches.size();

    while (remaining) {
        unsigned half = remaining >> 1;
        int difference = matches[insertAt + half] - ch;

        // The character is already in the set - nothing to do.
        if (!difference)
            return;

        if (difference > 0) {
            // The probed entry is greater than ch; keep searching below it.
            remaining = half;
            continue;
        }

        // The probed entry is less than ch; keep searching above it.
        insertAt += half + 1;
        remaining -= half + 1;
    }

    // The window is empty; insertAt is where ch belongs.
    if (insertAt != matches.size()) {
        matches.insert(insertAt, ch);
        return;
    }
    matches.append(ch);
}

// Adds the inclusive range [lo, hi] to 'ranges', which is kept ordered by position.
// The new range is merged with any existing entries that it overlaps or directly adjoins.
void CharacterClassConstructor::addSortedRange(Vector<CharacterRange>& ranges, UChar lo, UChar hi)
{
    const unsigned initialCount = ranges.size();

    // Plain linear walk. The vector is only modified on paths that return straight afterwards,
    // so the cached count stays valid for as long as the loop keeps iterating.
    for (unsigned current = 0; current < initialCount; ++current) {
        if (hi < ranges[current].begin) {
            // The new range lies wholly before this entry: either it butts up against the
            // entry (extend the entry downwards) or it becomes a new entry in front of it.
            if ((hi + 1) == ranges[current].begin)
                ranges[current].begin = lo;
            else {
                CharacterRange inserted = {lo, hi};
                ranges.insert(current, inserted);
            }
            return;
        }

        // From here on hi is at or after the beginning of this entry. If lo is more than one
        // past the end of the entry, the two neither overlap nor adjoin - try the next entry.
        if (lo > (ranges[current].end + 1))
            continue;

        // Overlapping or adjoining: grow this entry to cover the new range.
        if (lo < ranges[current].begin)
            ranges[current].begin = lo;
        if (hi > ranges[current].end)
            ranges[current].end = hi;

        // The grown entry may now reach into later entries; absorb each one it touches.
        // Every iteration removes an element, so the loop terminates.
        const unsigned following = current + 1;
        while ((following < ranges.size()) && (ranges[following].begin <= (ranges[current].end + 1))) {
            if (ranges[following].end > ranges[current].end)
                ranges[current].end = ranges[following].end;
            ranges.remove(following);
        }
        return;
    }

    // The new range comes after every existing entry.
    CharacterRange appended = {lo, hi};
    ranges.append(appended);
}

// Feeds the next character of a character class into the constructor.
// A character is not added to the class immediately; it is buffered in m_charBuffer because it
// may turn out to be the start of a range. A completed range is added to m_ranges (the part at
// or below 0x7f) and/or m_rangesUnicode (the part at or above 0x80), together with the
// other-case equivalents when m_isCaseInsensitive is set. A range whose start is greater than
// its end is not added; m_isUpsideDown is set instead.
void CharacterClassConstructor::put(UChar ch)
{
    // Parsing a regular expression like [a-z], we start in an initial empty state:
    //     ((m_charBuffer == -1) && !m_isPendingDash)
    // We buffer the 'a' since it may be (and is in this case) part of a range:
    //     ((m_charBuffer != -1) && !m_isPendingDash)
    // Having parsed the hyphen we then record that the dash is also pending:
    //     ((m_charBuffer != -1) && m_isPendingDash)
    // The next change will always take us back to the initial state - either because
    // a complete range has been parsed (such as [a-z]), or because a flush is forced,
    // due to an early end in the regexp ([a-]), or a character class escape being added
    // ([a-\s]).  The fourth permutation of m_charBuffer and m_isPendingDash is not permitted.
    ASSERT(!((m_charBuffer == -1) && m_isPendingDash));

    if (m_charBuffer != -1) {
        if (m_isPendingDash) {
            // EXAMPLE: parsing [-a-c], the 'c' reaches this case - we have buffered a previous character and seen a hyphen, so this is a range.
            UChar lo = m_charBuffer;
            UChar hi = ch;
            // Reset back to the initial state.
            m_charBuffer = -1;
            m_isPendingDash = false;
            
            // This is an error, detected lazily.  Do not proceed.
            if (lo > hi) {
                m_isUpsideDown = true;
                return;
            }
            
            // Part of the range at or below 0x7f: goes into m_ranges, clamped at 0x7f.
            if (lo <= 0x7f) {
                char asciiLo = lo;
                char asciiHi = std::min(hi, (UChar)0x7f);
                addSortedRange(m_ranges, lo, asciiHi);
                
                if (m_isCaseInsensitive) {
                    // If the range intersects 'A'-'Z', also add the intersection shifted to lower case.
                    if ((asciiLo <= 'Z') && (asciiHi >= 'A'))
                        addSortedRange(m_ranges, std::max(asciiLo, 'A')+('a'-'A'), std::min(asciiHi, 'Z')+('a'-'A'));
                    // If the range intersects 'a'-'z', also add the intersection shifted to upper case.
                    if ((asciiLo <= 'z') && (asciiHi >= 'a'))
                        addSortedRange(m_ranges, std::max(asciiLo, 'a')+('A'-'a'), std::min(asciiHi, 'z')+('A'-'a'));
                }
            }
            // Part of the range at or above 0x80: goes into m_rangesUnicode, starting no lower than 0x80.
            if (hi >= 0x80) {
                UChar unicodeCurr = std::max(lo, (UChar)0x80);
                addSortedRange(m_rangesUnicode, unicodeCurr, hi);
                
                if (m_isCaseInsensitive) {
                    // We're going to scan along, updating the start of the range; each pass adds one
                    // contiguous run of other-case characters.
                    while (unicodeCurr <= hi) {
                        // Spin forwards over any characters that don't have two cases
                        // (jsc_pcre_ucp_othercase yields -1 for these).
                        for (; jsc_pcre_ucp_othercase(unicodeCurr) == -1; ++unicodeCurr) {
                            // If this was the last character in the range, we're done.
                            if (unicodeCurr == hi)
                                return;
                        }
                        // If we fall through to here, unicodeCurr <= hi & has another case. Get the other case.
                        UChar rangeStart = unicodeCurr;
                        UChar otherCurr = jsc_pcre_ucp_othercase(unicodeCurr);
                        
                        // If unicodeCurr is not yet hi, check the next char in the range.  If it also has another case,
                        // and if its other case value is one greater than the othercase value for the current last
                        // character included in the range, we can include next into the range.
                        while ((unicodeCurr < hi) && (jsc_pcre_ucp_othercase(unicodeCurr + 1) == (otherCurr + 1))) {
                            // Increment unicodeCurr; it points to the end of the range.
                            // Increment otherCurr; due to the check above, the other case for next must be 1 greater than the current other value.
                            ++unicodeCurr;
                            ++otherCurr;
                        }
                        
                        // otherCurr is the last in the range of other case chars, calculate offset to get back to the start.
                        addSortedRange(m_rangesUnicode, otherCurr-(unicodeCurr-rangeStart), otherCurr);
                        
                        // unicodeCurr has been added, move on to the next char.
                        ++unicodeCurr;
                    }
                }
            }
        } else if (ch == '-')
            // EXAMPLE: parsing [-a-c], the second '-' reaches this case - the hyphen is treated as potentially indicating a range.
            m_isPendingDash = true;
        else {
            // EXAMPLE: Parsing [-a-c], the 'a' reaches this case - we replace the previously buffered char with the 'a'.
            flush();
            m_charBuffer = ch;
        }
    } else
        // EXAMPLE: Parsing [-a-c], the first hyphen reaches this case - there is no buffered character
        // (the hyphen is not treated as a special character in this case, same handling for any char).
        m_charBuffer = ch;
}

// Commits any pending state to the match arrays and returns to the initial (empty) state.
// A character added to the set is first held in m_charBuffer, in case it turns out to be the
// start of a range. Once it is known not to be specifying a range it is added here, in sorted
// order, to either the ASCII array (m_matches) or the unicode array (m_matchesUnicode).
// For case insensitive patterns the other case of the character is added as well.
// A hyphen that was pending is added as a literal '-'.
void CharacterClassConstructor::flush()
{
    const bool hasBufferedCharacter = m_charBuffer != -1;
    if (hasBufferedCharacter) {
        if (m_charBuffer > 0x7f) {
            // Non-ASCII: store the character, plus its other case if it has one.
            addSorted(m_matchesUnicode, m_charBuffer);
            if (m_isCaseInsensitive) {
                int otherCase = jsc_pcre_ucp_othercase(m_charBuffer);
                if (otherCase != -1)
                    addSorted(m_matchesUnicode, otherCase);
            }
        } else if (!m_isCaseInsensitive)
            addSorted(m_matches, m_charBuffer);
        else {
            // ASCII, ignoring case: a letter brings its opposite-case twin along.
            if (isASCIILower(m_charBuffer))
                addSorted(m_matches, toASCIIUpper(m_charBuffer));
            addSorted(m_matches, m_charBuffer);
            if (isASCIIUpper(m_charBuffer))
                addSorted(m_matches, toASCIILower(m_charBuffer));
        }

        // The buffer is empty again.
        m_charBuffer = -1;
    }

    if (!m_isPendingDash)
        return;

    // The hyphen did not end up forming a range, so it stands for itself.
    addSorted(m_matches, '-');
    m_isPendingDash = false;
}

void CharacterClassConstructor::append(const CharacterClass& other)
{
    // [x-\s] will add, 'x', '-', and all unicode spaces to new class (same as [x\s-]).
    // Need to check the spec, really, but think this matches PCRE behaviour.
    flush();
    
    if (other.numMatches) {
        for (size_t i = 0; i < other.numMatches; ++i)
            addSorted(m_matches, other.matches[i]);
    }
    if (other.numRanges) {
        for (size_t i = 0; i < other.numRanges; ++i)
            addSortedRange(m_ranges, other.ranges[i].begin, other.ranges[i].end);
    }
    if (other.numMatchesUnicode) {
        for (size_t i = 0; i < other.numMatchesUnicode; ++i)
            addSorted(m_matchesUnicode, other.matchesUnicode[i]);
    }
    if (other.numRangesUnicode) {
        for (size_t i = 0; i < other.numRangesUnicode; ++i)
            addSortedRange(m_rangesUnicode, other.rangesUnicode[i].begin, other.rangesUnicode[i].end);
    }
}

} } // namespace JSC::WREC

#endif // ENABLE(WREC)
Commit	Line	Data
9dae56ea A	1	/*
	2	* Copyright (C) 2008, 2009 Apple Inc. All rights reserved.
	3	*
	4	* Redistribution and use in source and binary forms, with or without
	5	* modification, are permitted provided that the following conditions
	6	* are met:
	7	* 1. Redistributions of source code must retain the above copyright
	8	* notice, this list of conditions and the following disclaimer.
	9	* 2. Redistributions in binary form must reproduce the above copyright
	10	* notice, this list of conditions and the following disclaimer in the
	11	* documentation and/or other materials provided with the distribution.
	12	*
	13	* THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
	14	* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
	15	* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
	16	* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
	17	* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
	18	* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
	19	* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
	20	* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
	21	* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
	22	* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
	23	* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
	24	*/
	25
	26	#include "config.h"
	27	#include "CharacterClassConstructor.h"
	28
	29	#if ENABLE(WREC)
	30
	31	#include "pcre_internal.h"
	32	#include <wtf/ASCIICType.h>
	33
	34	using namespace WTF;
	35
	36	namespace JSC { namespace WREC {
	37
	38	void CharacterClassConstructor::addSorted(Vector<UChar>& matches, UChar ch)
	39	{
	40	unsigned pos = 0;
	41	unsigned range = matches.size();
	42
	43	// binary chop, find position to insert char.
	44	while (range) {
	45	unsigned index = range >> 1;
	46
	47	int val = matches[pos+index] - ch;
	48	if (!val)
	49	return;
	50	else if (val > 0)
	51	range = index;
	52	else {
	53	pos += (index+1);
	54	range -= (index+1);
	55	}
	56	}
	57
	58	if (pos == matches.size())
	59	matches.append(ch);
	60	else
	61	matches.insert(pos, ch);
	62	}
	63
	64	void CharacterClassConstructor::addSortedRange(Vector<CharacterRange>& ranges, UChar lo, UChar hi)
65	{
66	unsigned end = ranges.size();
67
68	// Simple linear scan - I doubt there are that many ranges anyway...
69	// feel free to fix this with something faster (eg binary chop).
70	for (unsigned i = 0; i < end; ++i) {
71	// does the new range fall before the current position in the array
72	if (hi < ranges[i].begin) {
73	// optional optimization: concatenate appending ranges? - may not be worthwhile.
74	if (hi == (ranges[i].begin - 1)) {
75	ranges[i].begin = lo;
76	return;
77	}
78	CharacterRange r = {lo, hi};
79	ranges.insert(i, r);
80	return;
81	}
82	// Okay, since we didn't hit the last case, the end of the new range is definitely at or after the begining
83	// If the new range start at or before the end of the last range, then the overlap (if it starts one after the
84	// end of the last range they concatenate, which is just as good.
85	if (lo <= (ranges[i].end + 1)) {
86	// found an intersect! we'll replace this entry in the array.
87	ranges[i].begin = std::min(ranges[i].begin, lo);
88	ranges[i].end = std::max(ranges[i].end, hi);
89
90	// now check if the new range can subsume any subsequent ranges.
91	unsigned next = i+1;
92	// each iteration of the loop we will either remove something from the list, or break the loop.
93	while (next < ranges.size()) {
94	if (ranges[next].begin <= (ranges[i].end + 1)) {
95	// the next entry now overlaps / concatenates this one.
96	ranges[i].end = std::max(ranges[i].end, ranges[next].end);
97	ranges.remove(next);
98	} else
99	break;
100	}
101
102	return;
103	}
104	}
105
106	// CharacterRange comes after all existing ranges.
107	CharacterRange r = {lo, hi};
108	ranges.append(r);
109	}
110
111	void CharacterClassConstructor::put(UChar ch)
112	{
113	// Parsing a regular expression like [a-z], we start in an initial empty state:
114	// ((m_charBuffer == -1) && !m_isPendingDash)
115	// When buffer the 'a' sice it may be (and is in this case) part of a range:
116	// ((m_charBuffer != -1) && !m_isPendingDash)
117	// Having parsed the hyphen we then record that the dash is also pending:
118	// ((m_charBuffer != -1) && m_isPendingDash)
119	// The next change will always take us back to the initial state - either because
120	// a complete range has been parsed (such as [a-z]), or because a flush is forced,
121	// due to an early end in the regexp ([a-]), or a character class escape being added
122	// ([a-\s]). The fourth permutation of m_charBuffer and m_isPendingDash is not permitted.
123	ASSERT(!((m_charBuffer == -1) && m_isPendingDash));
124
125	if (m_charBuffer != -1) {
126	if (m_isPendingDash) {
127	// EXAMPLE: parsing [-a-c], the 'c' reaches this case - we have buffered a previous character and seen a hyphen, so this is a range.
128	UChar lo = m_charBuffer;
129	UChar hi = ch;
130	// Reset back to the inital state.
131	m_charBuffer = -1;
132	m_isPendingDash = false;
133
134	// This is an error, detected lazily. Do not proceed.
135	if (lo > hi) {
136	m_isUpsideDown = true;
137	return;
138	}
139
140	if (lo <= 0x7f) {
141	char asciiLo = lo;
142	char asciiHi = std::min(hi, (UChar)0x7f);
143	addSortedRange(m_ranges, lo, asciiHi);
144
145	if (m_isCaseInsensitive) {
146	if ((asciiLo <= 'Z') && (asciiHi >= 'A'))
147	addSortedRange(m_ranges, std::max(asciiLo, 'A')+('a'-'A'), std::min(asciiHi, 'Z')+('a'-'A'));
148	if ((asciiLo <= 'z') && (asciiHi >= 'a'))
149	addSortedRange(m_ranges, std::max(asciiLo, 'a')+('A'-'a'), std::min(asciiHi, 'z')+('A'-'a'));
150	}
151	}
152	if (hi >= 0x80) {
153	UChar unicodeCurr = std::max(lo, (UChar)0x80);
154	addSortedRange(m_rangesUnicode, unicodeCurr, hi);
155
156	if (m_isCaseInsensitive) {
157	// we're going to scan along, updating the start of the range
158	while (unicodeCurr <= hi) {
159	// Spin forwards over any characters that don't have two cases.
160	for (; jsc_pcre_ucp_othercase(unicodeCurr) == -1; ++unicodeCurr) {
161	// if this was the last character in the range, we're done.
162	if (unicodeCurr == hi)
163	return;
164	}
165	// if we fall through to here, unicodeCurr <= hi & has another case. Get the other case.
166	UChar rangeStart = unicodeCurr;
167	UChar otherCurr = jsc_pcre_ucp_othercase(unicodeCurr);
168
169	// If unicodeCurr is not yet hi, check the next char in the range. If it also has another case,
170	// and if it's other case value is one greater then the othercase value for the current last
171	// character included in the range, we can include next into the range.
172	while ((unicodeCurr < hi) && (jsc_pcre_ucp_othercase(unicodeCurr + 1) == (otherCurr + 1))) {
173	// increment unicodeCurr; it points to the end of the range.
174	// increment otherCurr, due to the check above other for next must be 1 greater than the currrent other value.
175	++unicodeCurr;
176	++otherCurr;
177	}
178
179	// otherChar is the last in the range of other case chars, calculate offset to get back to the start.
180	addSortedRange(m_rangesUnicode, otherCurr-(unicodeCurr-rangeStart), otherCurr);
181
182	// unicodeCurr has been added, move on to the next char.
183	++unicodeCurr;
184	}
185	}
186	}
187	} else if (ch == '-')
188	// EXAMPLE: parsing [-a-c], the second '-' reaches this case - the hyphen is treated as potentially indicating a range.
189	m_isPendingDash = true;
190	else {
191	// EXAMPLE: Parsing [-a-c], the 'a' reaches this case - we repace the previously buffered char with the 'a'.
192	flush();
193	m_charBuffer = ch;
194	}
195	} else
196	// EXAMPLE: Parsing [-a-c], the first hyphen reaches this case - there is no buffered character
197	// (the hyphen not treated as a special character in this case, same handling for any char).
198	m_charBuffer = ch;
199	}
200
201	// When a character is added to the set we do not immediately add it to the arrays, in case it is actually defining a range.
202	// When we have determined the character is not used in specifing a range it is added, in a sorted fashion, to the appropriate
203	// array (either ascii or unicode).
204	// If the pattern is case insensitive we add entries for both cases.
205	void CharacterClassConstructor::flush()
206	{
207	if (m_charBuffer != -1) {
208	if (m_charBuffer <= 0x7f) {
209	if (m_isCaseInsensitive && isASCIILower(m_charBuffer))
210	addSorted(m_matches, toASCIIUpper(m_charBuffer));
211	addSorted(m_matches, m_charBuffer);
212	if (m_isCaseInsensitive && isASCIIUpper(m_charBuffer))
213	addSorted(m_matches, toASCIILower(m_charBuffer));
214	} else {
215	addSorted(m_matchesUnicode, m_charBuffer);
216	if (m_isCaseInsensitive) {
217	int other = jsc_pcre_ucp_othercase(m_charBuffer);
218	if (other != -1)
219	addSorted(m_matchesUnicode, other);
220	}
221	}
222	m_charBuffer = -1;
223	}
224
225	if (m_isPendingDash) {
226	addSorted(m_matches, '-');
227	m_isPendingDash = false;
228	}
229	}
230
231	void CharacterClassConstructor::append(const CharacterClass& other)
232	{
233	// [x-\s] will add, 'x', '-', and all unicode spaces to new class (same as [x\s-]).
234	// Need to check the spec, really, but think this matches PCRE behaviour.
235	flush();
236
237	if (other.numMatches) {
238	for (size_t i = 0; i < other.numMatches; ++i)
239	addSorted(m_matches, other.matches[i]);
240	}
241	if (other.numRanges) {
242	for (size_t i = 0; i < other.numRanges; ++i)
243	addSortedRange(m_ranges, other.ranges[i].begin, other.ranges[i].end);
244	}
245	if (other.numMatchesUnicode) {
246	for (size_t i = 0; i < other.numMatchesUnicode; ++i)
247	addSorted(m_matchesUnicode, other.matchesUnicode[i]);
248	}
249	if (other.numRangesUnicode) {
250	for (size_t i = 0; i < other.numRangesUnicode; ++i)
251	addSortedRange(m_rangesUnicode, other.rangesUnicode[i].begin, other.rangesUnicode[i].end);
252	}
253	}
254
255	} } // namespace JSC::WREC
256
257	#endif // ENABLE(WREC)