git.saurik.com Git - apple/javascriptcore.git/blame_incremental

... / ...

Commit	Line	Data
	1	/*
	2	* Copyright (C) 2009 Apple Inc. All rights reserved.
	3	*
	4	* Redistribution and use in source and binary forms, with or without
	5	* modification, are permitted provided that the following conditions
	6	* are met:
	7	* 1. Redistributions of source code must retain the above copyright
	8	* notice, this list of conditions and the following disclaimer.
	9	* 2. Redistributions in binary form must reproduce the above copyright
	10	* notice, this list of conditions and the following disclaimer in the
	11	* documentation and/or other materials provided with the distribution.
	12	*
	13	* THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
	14	* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
	15	* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
	16	* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
	17	* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
	18	* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
	19	* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
	20	* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
	21	* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
	22	* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
	23	* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
	24	*/
	25
	26	#include "config.h"
	27	#include "RegexCompiler.h"
	28
	29	#include "RegexInterpreter.h"
	30	#include "RegexPattern.h"
	31	#include <wtf/Vector.h>
	32
	33	#if ENABLE(YARR)
	34
	35	using namespace WTF;
	36
	37	namespace JSC { namespace Yarr {
	38
	39	class CharacterClassConstructor {
	40	public:
	41	CharacterClassConstructor(bool isCaseInsensitive = false)
	42	: m_isCaseInsensitive(isCaseInsensitive)
	43	{
	44	}
	45
	46	void reset()
	47	{
	48	m_matches.clear();
	49	m_ranges.clear();
	50	m_matchesUnicode.clear();
	51	m_rangesUnicode.clear();
	52	}
	53
	54	void append(const CharacterClass* other)
	55	{
	56	for (size_t i = 0; i < other->m_matches.size(); ++i)
	57	addSorted(m_matches, other->m_matches[i]);
	58	for (size_t i = 0; i < other->m_ranges.size(); ++i)
	59	addSortedRange(m_ranges, other->m_ranges[i].begin, other->m_ranges[i].end);
	60	for (size_t i = 0; i < other->m_matchesUnicode.size(); ++i)
	61	addSorted(m_matchesUnicode, other->m_matchesUnicode[i]);
	62	for (size_t i = 0; i < other->m_rangesUnicode.size(); ++i)
	63	addSortedRange(m_rangesUnicode, other->m_rangesUnicode[i].begin, other->m_rangesUnicode[i].end);
	64	}
	65
	66	void putChar(UChar ch)
	67	{
	68	if (ch <= 0x7f) {
	69	if (m_isCaseInsensitive && isASCIIAlpha(ch)) {
	70	addSorted(m_matches, toASCIIUpper(ch));
	71	addSorted(m_matches, toASCIILower(ch));
	72	} else
	73	addSorted(m_matches, ch);
	74	} else {
	75	UChar upper, lower;
	76	if (m_isCaseInsensitive && ((upper = Unicode::toUpper(ch)) != (lower = Unicode::toLower(ch)))) {
	77	addSorted(m_matchesUnicode, upper);
	78	addSorted(m_matchesUnicode, lower);
	79	} else
	80	addSorted(m_matchesUnicode, ch);
	81	}
	82	}
	83
	84	// returns true if this character has another case, and 'ch' is the upper case form.
	85	static inline bool isUnicodeUpper(UChar ch)
	86	{
	87	return ch != Unicode::toLower(ch);
	88	}
	89
	90	// returns true if this character has another case, and 'ch' is the lower case form.
	91	static inline bool isUnicodeLower(UChar ch)
	92	{
	93	return ch != Unicode::toUpper(ch);
	94	}
	95
	96	void putRange(UChar lo, UChar hi)
	97	{
	98	if (lo <= 0x7f) {
	99	char asciiLo = lo;
	100	char asciiHi = std::min(hi, (UChar)0x7f);
	101	addSortedRange(m_ranges, lo, asciiHi);
	102
	103	if (m_isCaseInsensitive) {
	104	if ((asciiLo <= 'Z') && (asciiHi >= 'A'))
	105	addSortedRange(m_ranges, std::max(asciiLo, 'A')+('a'-'A'), std::min(asciiHi, 'Z')+('a'-'A'));
	106	if ((asciiLo <= 'z') && (asciiHi >= 'a'))
	107	addSortedRange(m_ranges, std::max(asciiLo, 'a')+('A'-'a'), std::min(asciiHi, 'z')+('A'-'a'));
	108	}
	109	}
	110	if (hi >= 0x80) {
	111	uint32_t unicodeCurr = std::max(lo, (UChar)0x80);
	112	addSortedRange(m_rangesUnicode, unicodeCurr, hi);
	113
	114	if (m_isCaseInsensitive) {
	115	while (unicodeCurr <= hi) {
	116	// If the upper bound of the range (hi) is 0xffff, the increments to
	117	// unicodeCurr in this loop may take it to 0x10000. This is fine
	118	// (if so we won't re-enter the loop, since the loop condition above
	119	// will definitely fail) - but this does mean we cannot use a UChar
	120	// to represent unicodeCurr, we must use a 32-bit value instead.
	121	ASSERT(unicodeCurr <= 0xffff);
	122
	123	if (isUnicodeUpper(unicodeCurr)) {
	124	UChar lowerCaseRangeBegin = Unicode::toLower(unicodeCurr);
	125	UChar lowerCaseRangeEnd = lowerCaseRangeBegin;
	126	while ((++unicodeCurr <= hi) && isUnicodeUpper(unicodeCurr) && (Unicode::toLower(unicodeCurr) == (lowerCaseRangeEnd + 1)))
	127	lowerCaseRangeEnd++;
	128	addSortedRange(m_rangesUnicode, lowerCaseRangeBegin, lowerCaseRangeEnd);
	129	} else if (isUnicodeLower(unicodeCurr)) {
	130	UChar upperCaseRangeBegin = Unicode::toUpper(unicodeCurr);
	131	UChar upperCaseRangeEnd = upperCaseRangeBegin;
	132	while ((++unicodeCurr <= hi) && isUnicodeLower(unicodeCurr) && (Unicode::toUpper(unicodeCurr) == (upperCaseRangeEnd + 1)))
	133	upperCaseRangeEnd++;
	134	addSortedRange(m_rangesUnicode, upperCaseRangeBegin, upperCaseRangeEnd);
	135	} else
	136	++unicodeCurr;
	137	}
	138	}
	139	}
	140	}
	141
	142	CharacterClass* charClass()
	143	{
	144	CharacterClass* characterClass = new CharacterClass();
	145
	146	characterClass->m_matches.append(m_matches);
	147	characterClass->m_ranges.append(m_ranges);
	148	characterClass->m_matchesUnicode.append(m_matchesUnicode);
	149	characterClass->m_rangesUnicode.append(m_rangesUnicode);
	150
	151	reset();
	152
	153	return characterClass;
	154	}
	155
	156	private:
	157	void addSorted(Vector<UChar>& matches, UChar ch)
	158	{
	159	unsigned pos = 0;
	160	unsigned range = matches.size();
	161
	162	// binary chop, find position to insert char.
	163	while (range) {
	164	unsigned index = range >> 1;
	165
	166	int val = matches[pos+index] - ch;
	167	if (!val)
	168	return;
	169	else if (val > 0)
	170	range = index;
	171	else {
	172	pos += (index+1);
	173	range -= (index+1);
	174	}
	175	}
	176
	177	if (pos == matches.size())
	178	matches.append(ch);
	179	else
	180	matches.insert(pos, ch);
	181	}
	182
	183	void addSortedRange(Vector<CharacterRange>& ranges, UChar lo, UChar hi)
	184	{
	185	unsigned end = ranges.size();
	186
	187	// Simple linear scan - I doubt there are that many ranges anyway...
	188	// feel free to fix this with something faster (eg binary chop).
	189	for (unsigned i = 0; i < end; ++i) {
	190	// does the new range fall before the current position in the array
	191	if (hi < ranges[i].begin) {
	192	// optional optimization: concatenate appending ranges? - may not be worthwhile.
	193	if (hi == (ranges[i].begin - 1)) {
	194	ranges[i].begin = lo;
	195	return;
	196	}
	197	ranges.insert(i, CharacterRange(lo, hi));
	198	return;
	199	}
	200	// Okay, since we didn't hit the last case, the end of the new range is definitely at or after the begining
	201	// If the new range start at or before the end of the last range, then the overlap (if it starts one after the
	202	// end of the last range they concatenate, which is just as good.
	203	if (lo <= (ranges[i].end + 1)) {
	204	// found an intersect! we'll replace this entry in the array.
	205	ranges[i].begin = std::min(ranges[i].begin, lo);
	206	ranges[i].end = std::max(ranges[i].end, hi);
	207
	208	// now check if the new range can subsume any subsequent ranges.
	209	unsigned next = i+1;
	210	// each iteration of the loop we will either remove something from the list, or break the loop.
	211	while (next < ranges.size()) {
	212	if (ranges[next].begin <= (ranges[i].end + 1)) {
	213	// the next entry now overlaps / concatenates this one.
	214	ranges[i].end = std::max(ranges[i].end, ranges[next].end);
	215	ranges.remove(next);
	216	} else
	217	break;
	218	}
	219
	220	return;
	221	}
	222	}
	223
	224	// CharacterRange comes after all existing ranges.
	225	ranges.append(CharacterRange(lo, hi));
	226	}
	227
	228	bool m_isCaseInsensitive;
	229
	230	Vector<UChar> m_matches;
	231	Vector<CharacterRange> m_ranges;
	232	Vector<UChar> m_matchesUnicode;
	233	Vector<CharacterRange> m_rangesUnicode;
	234	};
	235
	236
	237	CharacterClass* newlineCreate()
	238	{
	239	CharacterClass* characterClass = new CharacterClass();
	240
	241	characterClass->m_matches.append('\n');
	242	characterClass->m_matches.append('\r');
	243	characterClass->m_matchesUnicode.append(0x2028);
	244	characterClass->m_matchesUnicode.append(0x2029);
	245
	246	return characterClass;
	247	}
	248
	249	CharacterClass* digitsCreate()
	250	{
	251	CharacterClass* characterClass = new CharacterClass();
	252
	253	characterClass->m_ranges.append(CharacterRange('0', '9'));
	254
	255	return characterClass;
	256	}
	257
	258	CharacterClass* spacesCreate()
	259	{
	260	CharacterClass* characterClass = new CharacterClass();
	261
	262	characterClass->m_matches.append(' ');
	263	characterClass->m_ranges.append(CharacterRange('\t', '\r'));
	264	characterClass->m_matchesUnicode.append(0x00a0);
	265	characterClass->m_matchesUnicode.append(0x1680);
	266	characterClass->m_matchesUnicode.append(0x180e);
	267	characterClass->m_matchesUnicode.append(0x2028);
	268	characterClass->m_matchesUnicode.append(0x2029);
	269	characterClass->m_matchesUnicode.append(0x202f);
	270	characterClass->m_matchesUnicode.append(0x205f);
	271	characterClass->m_matchesUnicode.append(0x3000);
	272	characterClass->m_rangesUnicode.append(CharacterRange(0x2000, 0x200a));
	273
	274	return characterClass;
	275	}
	276
	277	CharacterClass* wordcharCreate()
	278	{
	279	CharacterClass* characterClass = new CharacterClass();
	280
	281	characterClass->m_matches.append('_');
	282	characterClass->m_ranges.append(CharacterRange('0', '9'));
	283	characterClass->m_ranges.append(CharacterRange('A', 'Z'));
	284	characterClass->m_ranges.append(CharacterRange('a', 'z'));
	285
	286	return characterClass;
	287	}
	288
	289	CharacterClass* nondigitsCreate()
	290	{
	291	CharacterClass* characterClass = new CharacterClass();
	292
	293	characterClass->m_ranges.append(CharacterRange(0, '0' - 1));
	294	characterClass->m_ranges.append(CharacterRange('9' + 1, 0x7f));
	295	characterClass->m_rangesUnicode.append(CharacterRange(0x80, 0xffff));
	296
	297	return characterClass;
	298	}
	299
	300	CharacterClass* nonspacesCreate()
	301	{
	302	CharacterClass* characterClass = new CharacterClass();
	303
	304	characterClass->m_ranges.append(CharacterRange(0, '\t' - 1));
	305	characterClass->m_ranges.append(CharacterRange('\r' + 1, ' ' - 1));
	306	characterClass->m_ranges.append(CharacterRange(' ' + 1, 0x7f));
	307	characterClass->m_rangesUnicode.append(CharacterRange(0x0080, 0x009f));
	308	characterClass->m_rangesUnicode.append(CharacterRange(0x00a1, 0x167f));
	309	characterClass->m_rangesUnicode.append(CharacterRange(0x1681, 0x180d));
	310	characterClass->m_rangesUnicode.append(CharacterRange(0x180f, 0x1fff));
	311	characterClass->m_rangesUnicode.append(CharacterRange(0x200b, 0x2027));
	312	characterClass->m_rangesUnicode.append(CharacterRange(0x202a, 0x202e));
	313	characterClass->m_rangesUnicode.append(CharacterRange(0x2030, 0x205e));
	314	characterClass->m_rangesUnicode.append(CharacterRange(0x2060, 0x2fff));
	315	characterClass->m_rangesUnicode.append(CharacterRange(0x3001, 0xffff));
	316
	317	return characterClass;
	318	}
	319
	320	CharacterClass* nonwordcharCreate()
	321	{
	322	CharacterClass* characterClass = new CharacterClass();
	323
	324	characterClass->m_matches.append('`');
	325	characterClass->m_ranges.append(CharacterRange(0, '0' - 1));
	326	characterClass->m_ranges.append(CharacterRange('9' + 1, 'A' - 1));
	327	characterClass->m_ranges.append(CharacterRange('Z' + 1, '_' - 1));
	328	characterClass->m_ranges.append(CharacterRange('z' + 1, 0x7f));
	329	characterClass->m_rangesUnicode.append(CharacterRange(0x80, 0xffff));
	330
	331	return characterClass;
	332	}
	333
	334
	335	class RegexPatternConstructor {
	336	public:
	337	RegexPatternConstructor(RegexPattern& pattern)
	338	: m_pattern(pattern)
	339	, m_characterClassConstructor(pattern.m_ignoreCase)
	340	{
	341	}
	342
	343	~RegexPatternConstructor()
	344	{
	345	}
	346
	347	void reset()
	348	{
	349	m_pattern.reset();
	350	m_characterClassConstructor.reset();
	351	}
	352
	353	void assertionBOL()
	354	{
	355	m_alternative->m_terms.append(PatternTerm::BOL());
	356	}
	357	void assertionEOL()
	358	{
	359	m_alternative->m_terms.append(PatternTerm::EOL());
	360	}
	361	void assertionWordBoundary(bool invert)
	362	{
	363	m_alternative->m_terms.append(PatternTerm::WordBoundary(invert));
	364	}
	365
	366	void atomPatternCharacter(UChar ch)
	367	{
	368	// We handle case-insensitive checking of unicode characters which do have both
	369	// cases by handling them as if they were defined using a CharacterClass.
	370	if (m_pattern.m_ignoreCase && !isASCII(ch) && (Unicode::toUpper(ch) != Unicode::toLower(ch))) {
	371	atomCharacterClassBegin();
	372	atomCharacterClassAtom(ch);
	373	atomCharacterClassEnd();
	374	} else
	375	m_alternative->m_terms.append(PatternTerm(ch));
	376	}
	377
	378	void atomBuiltInCharacterClass(BuiltInCharacterClassID classID, bool invert)
	379	{
	380	switch (classID) {
	381	case DigitClassID:
	382	m_alternative->m_terms.append(PatternTerm(m_pattern.digitsCharacterClass(), invert));
	383	break;
	384	case SpaceClassID:
	385	m_alternative->m_terms.append(PatternTerm(m_pattern.spacesCharacterClass(), invert));
	386	break;
	387	case WordClassID:
	388	m_alternative->m_terms.append(PatternTerm(m_pattern.wordcharCharacterClass(), invert));
	389	break;
	390	case NewlineClassID:
	391	m_alternative->m_terms.append(PatternTerm(m_pattern.newlineCharacterClass(), invert));
	392	break;
	393	}
	394	}
	395
	396	void atomCharacterClassBegin(bool invert = false)
	397	{
	398	m_invertCharacterClass = invert;
	399	}
	400
	401	void atomCharacterClassAtom(UChar ch)
	402	{
	403	m_characterClassConstructor.putChar(ch);
	404	}
	405
	406	void atomCharacterClassRange(UChar begin, UChar end)
	407	{
	408	m_characterClassConstructor.putRange(begin, end);
	409	}
	410
	411	void atomCharacterClassBuiltIn(BuiltInCharacterClassID classID, bool invert)
	412	{
	413	ASSERT(classID != NewlineClassID);
	414
	415	switch (classID) {
	416	case DigitClassID:
	417	m_characterClassConstructor.append(invert ? m_pattern.nondigitsCharacterClass() : m_pattern.digitsCharacterClass());
	418	break;
	419
	420	case SpaceClassID:
	421	m_characterClassConstructor.append(invert ? m_pattern.nonspacesCharacterClass() : m_pattern.spacesCharacterClass());
	422	break;
	423
	424	case WordClassID:
	425	m_characterClassConstructor.append(invert ? m_pattern.nonwordcharCharacterClass() : m_pattern.wordcharCharacterClass());
	426	break;
	427
	428	default:
	429	ASSERT_NOT_REACHED();
	430	}
	431	}
	432
	433	void atomCharacterClassEnd()
	434	{
	435	CharacterClass* newCharacterClass = m_characterClassConstructor.charClass();
	436	m_pattern.m_userCharacterClasses.append(newCharacterClass);
	437	m_alternative->m_terms.append(PatternTerm(newCharacterClass, m_invertCharacterClass));
	438	}
	439
	440	void atomParenthesesSubpatternBegin(bool capture = true)
	441	{
	442	unsigned subpatternId = m_pattern.m_numSubpatterns + 1;
	443	if (capture)
	444	m_pattern.m_numSubpatterns++;
	445
	446	PatternDisjunction* parenthesesDisjunction = new PatternDisjunction(m_alternative);
	447	m_pattern.m_disjunctions.append(parenthesesDisjunction);
	448	m_alternative->m_terms.append(PatternTerm(PatternTerm::TypeParenthesesSubpattern, subpatternId, parenthesesDisjunction, capture));
	449	m_alternative = parenthesesDisjunction->addNewAlternative();
	450	}
	451
	452	void atomParentheticalAssertionBegin(bool invert = false)
	453	{
	454	PatternDisjunction* parenthesesDisjunction = new PatternDisjunction(m_alternative);
	455	m_pattern.m_disjunctions.append(parenthesesDisjunction);
	456	m_alternative->m_terms.append(PatternTerm(PatternTerm::TypeParentheticalAssertion, m_pattern.m_numSubpatterns + 1, parenthesesDisjunction, invert));
	457	m_alternative = parenthesesDisjunction->addNewAlternative();
	458	}
	459
	460	void atomParenthesesEnd()
	461	{
	462	ASSERT(m_alternative->m_parent);
	463	ASSERT(m_alternative->m_parent->m_parent);
	464	m_alternative = m_alternative->m_parent->m_parent;
	465
	466	m_alternative->lastTerm().parentheses.lastSubpatternId = m_pattern.m_numSubpatterns;
	467	}
	468
	469	void atomBackReference(unsigned subpatternId)
	470	{
	471	ASSERT(subpatternId);
	472	m_pattern.m_maxBackReference = std::max(m_pattern.m_maxBackReference, subpatternId);
	473
	474	if (subpatternId > m_pattern.m_numSubpatterns) {
	475	m_alternative->m_terms.append(PatternTerm::ForwardReference());
	476	return;
	477	}
	478
	479	PatternAlternative* currentAlternative = m_alternative;
	480	ASSERT(currentAlternative);
	481
	482	// Note to self: if we waited until the AST was baked, we could also remove forwards refs
	483	while ((currentAlternative = currentAlternative->m_parent->m_parent)) {
	484	PatternTerm& term = currentAlternative->lastTerm();
	485	ASSERT((term.type == PatternTerm::TypeParenthesesSubpattern) \|\| (term.type == PatternTerm::TypeParentheticalAssertion));
	486
	487	if ((term.type == PatternTerm::TypeParenthesesSubpattern) && term.invertOrCapture && (subpatternId == term.subpatternId)) {
	488	m_alternative->m_terms.append(PatternTerm::ForwardReference());
	489	return;
	490	}
	491	}
	492
	493	m_alternative->m_terms.append(PatternTerm(subpatternId));
	494	}
	495
	496	PatternDisjunction* copyDisjunction(PatternDisjunction* disjunction)
	497	{
	498	PatternDisjunction* newDisjunction = new PatternDisjunction();
	499
	500	newDisjunction->m_parent = disjunction->m_parent;
	501	for (unsigned alt = 0; alt < disjunction->m_alternatives.size(); ++alt) {
	502	PatternAlternative* alternative = disjunction->m_alternatives[alt];
	503	PatternAlternative* newAlternative = newDisjunction->addNewAlternative();
	504	for (unsigned i = 0; i < alternative->m_terms.size(); ++i)
	505	newAlternative->m_terms.append(copyTerm(alternative->m_terms[i]));
	506	}
	507
	508	m_pattern.m_disjunctions.append(newDisjunction);
	509	return newDisjunction;
	510	}
	511
	512	PatternTerm copyTerm(PatternTerm& term)
	513	{
	514	if ((term.type != PatternTerm::TypeParenthesesSubpattern) && (term.type != PatternTerm::TypeParentheticalAssertion))
	515	return PatternTerm(term);
	516
	517	PatternTerm termCopy = term;
	518	termCopy.parentheses.disjunction = copyDisjunction(termCopy.parentheses.disjunction);
	519	return termCopy;
	520	}
	521
	522	void quantifyAtom(unsigned min, unsigned max, bool greedy)
	523	{
	524	ASSERT(min <= max);
	525	ASSERT(m_alternative->m_terms.size());
	526
	527	if (!max) {
	528	m_alternative->removeLastTerm();
	529	return;
	530	}
	531
	532	PatternTerm& term = m_alternative->lastTerm();
	533	ASSERT(term.type > PatternTerm::TypeAssertionWordBoundary);
	534	ASSERT((term.quantityCount == 1) && (term.quantityType == QuantifierFixedCount));
	535
	536	// For any assertion with a zero minimum, not matching is valid and has no effect,
	537	// remove it. Otherwise, we need to match as least once, but there is no point
	538	// matching more than once, so remove the quantifier. It is not entirely clear
	539	// from the spec whether or not this behavior is correct, but I believe this
	540	// matches Firefox. :-/
	541	if (term.type == PatternTerm::TypeParentheticalAssertion) {
	542	if (!min)
	543	m_alternative->removeLastTerm();
	544	return;
	545	}
	546
	547	if (min == 0)
	548	term.quantify(max, greedy ? QuantifierGreedy : QuantifierNonGreedy);
	549	else if (min == max)
	550	term.quantify(min, QuantifierFixedCount);
	551	else {
	552	term.quantify(min, QuantifierFixedCount);
	553	m_alternative->m_terms.append(copyTerm(term));
	554	// NOTE: this term is interesting from an analysis perspective, in that it can be ignored.....
	555	m_alternative->lastTerm().quantify((max == UINT_MAX) ? max : max - min, greedy ? QuantifierGreedy : QuantifierNonGreedy);
	556	if (m_alternative->lastTerm().type == PatternTerm::TypeParenthesesSubpattern)
	557	m_alternative->lastTerm().parentheses.isCopy = true;
	558	}
	559	}
	560
	561	void disjunction()
	562	{
	563	m_alternative = m_alternative->m_parent->addNewAlternative();
	564	}
	565
	566	void regexBegin()
	567	{
	568	m_pattern.m_body = new PatternDisjunction();
	569	m_alternative = m_pattern.m_body->addNewAlternative();
	570	m_pattern.m_disjunctions.append(m_pattern.m_body);
	571	}
	572	void regexEnd()
	573	{
	574	}
	575	void regexError()
	576	{
	577	}
	578
	579	unsigned setupAlternativeOffsets(PatternAlternative* alternative, unsigned currentCallFrameSize, unsigned initialInputPosition)
	580	{
	581	alternative->m_hasFixedSize = true;
	582	unsigned currentInputPosition = initialInputPosition;
	583
	584	for (unsigned i = 0; i < alternative->m_terms.size(); ++i) {
	585	PatternTerm& term = alternative->m_terms[i];
	586
	587	switch (term.type) {
	588	case PatternTerm::TypeAssertionBOL:
	589	case PatternTerm::TypeAssertionEOL:
	590	case PatternTerm::TypeAssertionWordBoundary:
	591	term.inputPosition = currentInputPosition;
	592	break;
	593
	594	case PatternTerm::TypeBackReference:
	595	term.inputPosition = currentInputPosition;
	596	term.frameLocation = currentCallFrameSize;
	597	currentCallFrameSize += RegexStackSpaceForBackTrackInfoBackReference;
	598	alternative->m_hasFixedSize = false;
	599	break;
	600
	601	case PatternTerm::TypeForwardReference:
	602	break;
	603
	604	case PatternTerm::TypePatternCharacter:
	605	term.inputPosition = currentInputPosition;
	606	if (term.quantityType != QuantifierFixedCount) {
	607	term.frameLocation = currentCallFrameSize;
	608	currentCallFrameSize += RegexStackSpaceForBackTrackInfoPatternCharacter;
	609	alternative->m_hasFixedSize = false;
	610	} else
	611	currentInputPosition += term.quantityCount;
	612	break;
	613
	614	case PatternTerm::TypeCharacterClass:
	615	term.inputPosition = currentInputPosition;
	616	if (term.quantityType != QuantifierFixedCount) {
	617	term.frameLocation = currentCallFrameSize;
	618	currentCallFrameSize += RegexStackSpaceForBackTrackInfoCharacterClass;
	619	alternative->m_hasFixedSize = false;
	620	} else
	621	currentInputPosition += term.quantityCount;
	622	break;
	623
	624	case PatternTerm::TypeParenthesesSubpattern:
	625	// Note: for fixed once parentheses we will ensure at least the minimum is available; others are on their own.
	626	term.frameLocation = currentCallFrameSize;
	627	if ((term.quantityCount == 1) && !term.parentheses.isCopy) {
	628	if (term.quantityType == QuantifierFixedCount) {
	629	currentCallFrameSize = setupDisjunctionOffsets(term.parentheses.disjunction, currentCallFrameSize, currentInputPosition);
	630	currentInputPosition += term.parentheses.disjunction->m_minimumSize;
	631	} else {
	632	currentCallFrameSize += RegexStackSpaceForBackTrackInfoParenthesesOnce;
	633	currentCallFrameSize = setupDisjunctionOffsets(term.parentheses.disjunction, currentCallFrameSize, currentInputPosition);
	634	}
	635	term.inputPosition = currentInputPosition;
	636	} else {
	637	term.inputPosition = currentInputPosition;
	638	setupDisjunctionOffsets(term.parentheses.disjunction, 0, currentInputPosition);
	639	currentCallFrameSize += RegexStackSpaceForBackTrackInfoParentheses;
	640	}
	641	// Fixed count of 1 could be accepted, if they have a fixed size AND if all alternatives are of the same length.
	642	alternative->m_hasFixedSize = false;
	643	break;
	644
	645	case PatternTerm::TypeParentheticalAssertion:
	646	term.inputPosition = currentInputPosition;
	647	term.frameLocation = currentCallFrameSize;
	648	currentCallFrameSize = setupDisjunctionOffsets(term.parentheses.disjunction, currentCallFrameSize + RegexStackSpaceForBackTrackInfoParentheticalAssertion, currentInputPosition);
	649	break;
	650	}
	651	}
	652
	653	alternative->m_minimumSize = currentInputPosition - initialInputPosition;
	654	return currentCallFrameSize;
	655	}
	656
	657	unsigned setupDisjunctionOffsets(PatternDisjunction* disjunction, unsigned initialCallFrameSize, unsigned initialInputPosition)
	658	{
	659	if ((disjunction != m_pattern.m_body) && (disjunction->m_alternatives.size() > 1))
	660	initialCallFrameSize += RegexStackSpaceForBackTrackInfoAlternative;
	661
	662	unsigned minimumInputSize = UINT_MAX;
	663	unsigned maximumCallFrameSize = 0;
	664	bool hasFixedSize = true;
	665
	666	for (unsigned alt = 0; alt < disjunction->m_alternatives.size(); ++alt) {
	667	PatternAlternative* alternative = disjunction->m_alternatives[alt];
	668	unsigned currentAlternativeCallFrameSize = setupAlternativeOffsets(alternative, initialCallFrameSize, initialInputPosition);
	669	minimumInputSize = min(minimumInputSize, alternative->m_minimumSize);
	670	maximumCallFrameSize = max(maximumCallFrameSize, currentAlternativeCallFrameSize);
	671	hasFixedSize &= alternative->m_hasFixedSize;
	672	}
	673
	674	ASSERT(minimumInputSize != UINT_MAX);
	675	ASSERT(maximumCallFrameSize >= initialCallFrameSize);
	676
	677	disjunction->m_hasFixedSize = hasFixedSize;
	678	disjunction->m_minimumSize = minimumInputSize;
	679	disjunction->m_callFrameSize = maximumCallFrameSize;
	680	return maximumCallFrameSize;
	681	}
	682
	683	void setupOffsets()
	684	{
	685	setupDisjunctionOffsets(m_pattern.m_body, 0, 0);
	686	}
	687
	688	private:
	689	RegexPattern& m_pattern;
	690	PatternAlternative* m_alternative;
	691	CharacterClassConstructor m_characterClassConstructor;
	692	bool m_invertCharacterClass;
	693	};
	694
	695
	696	const char* compileRegex(const UString& patternString, RegexPattern& pattern)
	697	{
	698	RegexPatternConstructor constructor(pattern);
	699
	700	if (const char* error = parse(constructor, patternString))
	701	return error;
	702
	703	// If the pattern contains illegal backreferences reset & reparse.
	704	// Quoting Netscape's "What's new in JavaScript 1.2",
	705	// "Note: if the number of left parentheses is less than the number specified
	706	// in \#, the \# is taken as an octal escape as described in the next row."
	707	if (pattern.containsIllegalBackReference()) {
	708	unsigned numSubpatterns = pattern.m_numSubpatterns;
	709
	710	constructor.reset();
	711	#if !ASSERT_DISABLED
	712	const char* error =
	713	#endif
	714	parse(constructor, patternString, numSubpatterns);
	715
	716	ASSERT(!error);
	717	ASSERT(numSubpatterns == pattern.m_numSubpatterns);
	718	}
	719
	720	constructor.setupOffsets();
	721
	722	return false;
	723	};
	724
	725
	726	} }
	727
	728	#endif