yarr/YarrPattern.cpp

   1 /*
   2  * Copyright (C) 2009 Apple Inc. All rights reserved.
   3  * Copyright (C) 2010 Peter Varga (pvarga@inf.u-szeged.hu), University of Szeged
   4  *
   5  * Redistribution and use in source and binary forms, with or without
   6  * modification, are permitted provided that the following conditions
   7  * are met:
   8  * 1. Redistributions of source code must retain the above copyright
   9  *    notice, this list of conditions and the following disclaimer.
  10  * 2. Redistributions in binary form must reproduce the above copyright
  11  *    notice, this list of conditions and the following disclaimer in the
  12  *    documentation and/or other materials provided with the distribution.
  13  *
  14  * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
  15  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  16  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
  17  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE INC. OR
  18  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
  19  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
  20  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
  21  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
  22  * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  23  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  24  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  25  */
  26
  27 #include "config.h"
  28 #include "YarrPattern.h"
  29
  30 #include "Yarr.h"
  31 #include "YarrCanonicalizeUCS2.h"
  32 #include "YarrParser.h"
  33 #include <wtf/Vector.h>
  34
  35 using namespace WTF;
  36
  37 namespace JSC { namespace Yarr {
  38
  39 #include "RegExpJitTables.h"
  40
  41 class CharacterClassConstructor {
  42 public:
  43     CharacterClassConstructor(bool isCaseInsensitive = false)
  44         : m_isCaseInsensitive(isCaseInsensitive)
  45     {
  46     }
  47
  48     void reset()
  49     {
  50         m_matches.clear();
  51         m_ranges.clear();
  52         m_matchesUnicode.clear();
  53         m_rangesUnicode.clear();
  54     }
  55
  56     void append(const CharacterClass* other)
  57     {
  58         for (size_t i = 0; i < other->m_matches.size(); ++i)
  59             addSorted(m_matches, other->m_matches[i]);
  60         for (size_t i = 0; i < other->m_ranges.size(); ++i)
  61             addSortedRange(m_ranges, other->m_ranges[i].begin, other->m_ranges[i].end);
  62         for (size_t i = 0; i < other->m_matchesUnicode.size(); ++i)
  63             addSorted(m_matchesUnicode, other->m_matchesUnicode[i]);
  64         for (size_t i = 0; i < other->m_rangesUnicode.size(); ++i)
  65             addSortedRange(m_rangesUnicode, other->m_rangesUnicode[i].begin, other->m_rangesUnicode[i].end);
  66     }
  67
  68     void putChar(UChar ch)
  69     {
  70         // Handle ascii cases.
  71         if (ch <= 0x7f) {
  72             if (m_isCaseInsensitive && isASCIIAlpha(ch)) {
  73                 addSorted(m_matches, toASCIIUpper(ch));
  74                 addSorted(m_matches, toASCIILower(ch));
  75             } else
  76                 addSorted(m_matches, ch);
  77             return;
  78         }
  79
  80         // Simple case, not a case-insensitive match.
  81         if (!m_isCaseInsensitive) {
  82             addSorted(m_matchesUnicode, ch);
  83             return;
  84         }
  85
  86         // Add multiple matches, if necessary.
  87         UCS2CanonicalizationRange* info = rangeInfoFor(ch);
  88         if (info->type == CanonicalizeUnique)
  89             addSorted(m_matchesUnicode, ch);
  90         else
  91             putUnicodeIgnoreCase(ch, info);
  92     }
  93
  94     void putUnicodeIgnoreCase(UChar ch, UCS2CanonicalizationRange* info)
  95     {
  96         ASSERT(m_isCaseInsensitive);
  97         ASSERT(ch > 0x7f);
  98         ASSERT(ch >= info->begin && ch <= info->end);
  99         ASSERT(info->type != CanonicalizeUnique);
 100         if (info->type == CanonicalizeSet) {
 101             for (uint16_t* set = characterSetInfo[info->value]; (ch = *set); ++set)
 102                 addSorted(m_matchesUnicode, ch);
 103         } else {
 104             addSorted(m_matchesUnicode, ch);
 105             addSorted(m_matchesUnicode, getCanonicalPair(info, ch));
 106         }
 107     }
 108
 109     void putRange(UChar lo, UChar hi)
 110     {
 111         if (lo <= 0x7f) {
 112             char asciiLo = lo;
 113             char asciiHi = std::min(hi, (UChar)0x7f);
 114             addSortedRange(m_ranges, lo, asciiHi);
 115
 116             if (m_isCaseInsensitive) {
 117                 if ((asciiLo <= 'Z') && (asciiHi >= 'A'))
 118                     addSortedRange(m_ranges, std::max(asciiLo, 'A')+('a'-'A'), std::min(asciiHi, 'Z')+('a'-'A'));
 119                 if ((asciiLo <= 'z') && (asciiHi >= 'a'))
 120                     addSortedRange(m_ranges, std::max(asciiLo, 'a')+('A'-'a'), std::min(asciiHi, 'z')+('A'-'a'));
 121             }
 122         }
 123         if (hi <= 0x7f)
 124             return;
 125
 126         lo = std::max(lo, (UChar)0x80);
 127         addSortedRange(m_rangesUnicode, lo, hi);
 128
 129         if (!m_isCaseInsensitive)
 130             return;
 131
 132         UCS2CanonicalizationRange* info = rangeInfoFor(lo);
 133         while (true) {
 134             // Handle the range [lo .. end]
 135             UChar end = std::min<UChar>(info->end, hi);
 136
 137             switch (info->type) {
 138             case CanonicalizeUnique:
 139                 // Nothing to do - no canonical equivalents.
 140                 break;
 141             case CanonicalizeSet: {
 142                 UChar ch;
 143                 for (uint16_t* set = characterSetInfo[info->value]; (ch = *set); ++set)
 144                     addSorted(m_matchesUnicode, ch);
 145                 break;
 146             }
 147             case CanonicalizeRangeLo:
 148                 addSortedRange(m_rangesUnicode, lo + info->value, end + info->value);
 149                 break;
 150             case CanonicalizeRangeHi:
 151                 addSortedRange(m_rangesUnicode, lo - info->value, end - info->value);
 152                 break;
 153             case CanonicalizeAlternatingAligned:
 154                 // Use addSortedRange since there is likely an abutting range to combine with.
 155                 if (lo & 1)
 156                     addSortedRange(m_rangesUnicode, lo - 1, lo - 1);
 157                 if (!(end & 1))
 158                     addSortedRange(m_rangesUnicode, end + 1, end + 1);
 159                 break;
 160             case CanonicalizeAlternatingUnaligned:
 161                 // Use addSortedRange since there is likely an abutting range to combine with.
 162                 if (!(lo & 1))
 163                     addSortedRange(m_rangesUnicode, lo - 1, lo - 1);
 164                 if (end & 1)
 165                     addSortedRange(m_rangesUnicode, end + 1, end + 1);
 166                 break;
 167             }
 168
 169             if (hi == end)
 170                 return;
 171
 172             ++info;
 173             lo = info->begin;
 174         };
 175
 176     }
 177
 178     CharacterClass* charClass()
 179     {
 180         CharacterClass* characterClass = new CharacterClass(0);
 181
 182         characterClass->m_matches.swap(m_matches);
 183         characterClass->m_ranges.swap(m_ranges);
 184         characterClass->m_matchesUnicode.swap(m_matchesUnicode);
 185         characterClass->m_rangesUnicode.swap(m_rangesUnicode);
 186
 187         return characterClass;
 188     }
 189
 190 private:
 191     void addSorted(Vector<UChar>& matches, UChar ch)
 192     {
 193         unsigned pos = 0;
 194         unsigned range = matches.size();
 195
 196         // binary chop, find position to insert char.
 197         while (range) {
 198             unsigned index = range >> 1;
 199
 200             int val = matches[pos+index] - ch;
 201             if (!val)
 202                 return;
 203             else if (val > 0)
 204                 range = index;
 205             else {
 206                 pos += (index+1);
 207                 range -= (index+1);
 208             }
 209         }
 210
 211         if (pos == matches.size())
 212             matches.append(ch);
 213         else
 214             matches.insert(pos, ch);
 215     }
 216
 217     void addSortedRange(Vector<CharacterRange>& ranges, UChar lo, UChar hi)
 218     {
 219         unsigned end = ranges.size();
 220
 221         // Simple linear scan - I doubt there are that many ranges anyway...
 222         // feel free to fix this with something faster (eg binary chop).
 223         for (unsigned i = 0; i < end; ++i) {
 224             // does the new range fall before the current position in the array
 225             if (hi < ranges[i].begin) {
 226                 // optional optimization: concatenate appending ranges? - may not be worthwhile.
 227                 if (hi == (ranges[i].begin - 1)) {
 228                     ranges[i].begin = lo;
 229                     return;
 230                 }
 231                 ranges.insert(i, CharacterRange(lo, hi));
 232                 return;
 233             }
 234             // Okay, since we didn't hit the last case, the end of the new range is definitely at or after the begining
 235             // If the new range start at or before the end of the last range, then the overlap (if it starts one after the
 236             // end of the last range they concatenate, which is just as good.
 237             if (lo <= (ranges[i].end + 1)) {
 238                 // found an intersect! we'll replace this entry in the array.
 239                 ranges[i].begin = std::min(ranges[i].begin, lo);
 240                 ranges[i].end = std::max(ranges[i].end, hi);
 241
 242                 // now check if the new range can subsume any subsequent ranges.
 243                 unsigned next = i+1;
 244                 // each iteration of the loop we will either remove something from the list, or break the loop.
 245                 while (next < ranges.size()) {
 246                     if (ranges[next].begin <= (ranges[i].end + 1)) {
 247                         // the next entry now overlaps / concatenates this one.
 248                         ranges[i].end = std::max(ranges[i].end, ranges[next].end);
 249                         ranges.remove(next);
 250                     } else
 251                         break;
 252                 }
 253
 254                 return;
 255             }
 256         }
 257
 258         // CharacterRange comes after all existing ranges.
 259         ranges.append(CharacterRange(lo, hi));
 260     }
 261
 262     bool m_isCaseInsensitive;
 263
 264     Vector<UChar> m_matches;
 265     Vector<CharacterRange> m_ranges;
 266     Vector<UChar> m_matchesUnicode;
 267     Vector<CharacterRange> m_rangesUnicode;
 268 };
 269
 270 class YarrPatternConstructor {
 271 public:
 272     YarrPatternConstructor(YarrPattern& pattern)
 273         : m_pattern(pattern)
 274         , m_characterClassConstructor(pattern.m_ignoreCase)
 275         , m_invertParentheticalAssertion(false)
 276     {
 277         m_pattern.m_body = new PatternDisjunction();
 278         m_alternative = m_pattern.m_body->addNewAlternative();
 279         m_pattern.m_disjunctions.append(m_pattern.m_body);
 280     }
 281
 282     ~YarrPatternConstructor()
 283     {
 284     }
 285
 286     void reset()
 287     {
 288         m_pattern.reset();
 289         m_characterClassConstructor.reset();
 290
 291         m_pattern.m_body = new PatternDisjunction();
 292         m_alternative = m_pattern.m_body->addNewAlternative();
 293         m_pattern.m_disjunctions.append(m_pattern.m_body);
 294     }
 295
 296     void assertionBOL()
 297     {
 298         if (!m_alternative->m_terms.size() & !m_invertParentheticalAssertion) {
 299             m_alternative->m_startsWithBOL = true;
 300             m_alternative->m_containsBOL = true;
 301             m_pattern.m_containsBOL = true;
 302         }
 303         m_alternative->m_terms.append(PatternTerm::BOL());
 304     }
 305     void assertionEOL()
 306     {
 307         m_alternative->m_terms.append(PatternTerm::EOL());
 308     }
 309     void assertionWordBoundary(bool invert)
 310     {
 311         m_alternative->m_terms.append(PatternTerm::WordBoundary(invert));
 312     }
 313
 314     void atomPatternCharacter(UChar ch)
 315     {
 316         // We handle case-insensitive checking of unicode characters which do have both
 317         // cases by handling them as if they were defined using a CharacterClass.
 318         if (!m_pattern.m_ignoreCase || isASCII(ch)) {
 319             m_alternative->m_terms.append(PatternTerm(ch));
 320             return;
 321         }
 322
 323         UCS2CanonicalizationRange* info = rangeInfoFor(ch);
 324         if (info->type == CanonicalizeUnique) {
 325             m_alternative->m_terms.append(PatternTerm(ch));
 326             return;
 327         }
 328
 329         m_characterClassConstructor.putUnicodeIgnoreCase(ch, info);
 330         CharacterClass* newCharacterClass = m_characterClassConstructor.charClass();
 331         m_pattern.m_userCharacterClasses.append(newCharacterClass);
 332         m_alternative->m_terms.append(PatternTerm(newCharacterClass, false));
 333     }
 334
 335     void atomBuiltInCharacterClass(BuiltInCharacterClassID classID, bool invert)
 336     {
 337         switch (classID) {
 338         case DigitClassID:
 339             m_alternative->m_terms.append(PatternTerm(m_pattern.digitsCharacterClass(), invert));
 340             break;
 341         case SpaceClassID:
 342             m_alternative->m_terms.append(PatternTerm(m_pattern.spacesCharacterClass(), invert));
 343             break;
 344         case WordClassID:
 345             m_alternative->m_terms.append(PatternTerm(m_pattern.wordcharCharacterClass(), invert));
 346             break;
 347         case NewlineClassID:
 348             m_alternative->m_terms.append(PatternTerm(m_pattern.newlineCharacterClass(), invert));
 349             break;
 350         }
 351     }
 352
 353     void atomCharacterClassBegin(bool invert = false)
 354     {
 355         m_invertCharacterClass = invert;
 356     }
 357
 358     void atomCharacterClassAtom(UChar ch)
 359     {
 360         m_characterClassConstructor.putChar(ch);
 361     }
 362
 363     void atomCharacterClassRange(UChar begin, UChar end)
 364     {
 365         m_characterClassConstructor.putRange(begin, end);
 366     }
 367
 368     void atomCharacterClassBuiltIn(BuiltInCharacterClassID classID, bool invert)
 369     {
 370         ASSERT(classID != NewlineClassID);
 371
 372         switch (classID) {
 373         case DigitClassID:
 374             m_characterClassConstructor.append(invert ? m_pattern.nondigitsCharacterClass() : m_pattern.digitsCharacterClass());
 375             break;
 376
 377         case SpaceClassID:
 378             m_characterClassConstructor.append(invert ? m_pattern.nonspacesCharacterClass() : m_pattern.spacesCharacterClass());
 379             break;
 380
 381         case WordClassID:
 382             m_characterClassConstructor.append(invert ? m_pattern.nonwordcharCharacterClass() : m_pattern.wordcharCharacterClass());
 383             break;
 384
 385         default:
 386             ASSERT_NOT_REACHED();
 387         }
 388     }
 389
 390     void atomCharacterClassEnd()
 391     {
 392         CharacterClass* newCharacterClass = m_characterClassConstructor.charClass();
 393         m_pattern.m_userCharacterClasses.append(newCharacterClass);
 394         m_alternative->m_terms.append(PatternTerm(newCharacterClass, m_invertCharacterClass));
 395     }
 396
 397     void atomParenthesesSubpatternBegin(bool capture = true)
 398     {
 399         unsigned subpatternId = m_pattern.m_numSubpatterns + 1;
 400         if (capture)
 401             m_pattern.m_numSubpatterns++;
 402
 403         PatternDisjunction* parenthesesDisjunction = new PatternDisjunction(m_alternative);
 404         m_pattern.m_disjunctions.append(parenthesesDisjunction);
 405         m_alternative->m_terms.append(PatternTerm(PatternTerm::TypeParenthesesSubpattern, subpatternId, parenthesesDisjunction, capture, false));
 406         m_alternative = parenthesesDisjunction->addNewAlternative();
 407     }
 408
 409     void atomParentheticalAssertionBegin(bool invert = false)
 410     {
 411         PatternDisjunction* parenthesesDisjunction = new PatternDisjunction(m_alternative);
 412         m_pattern.m_disjunctions.append(parenthesesDisjunction);
 413         m_alternative->m_terms.append(PatternTerm(PatternTerm::TypeParentheticalAssertion, m_pattern.m_numSubpatterns + 1, parenthesesDisjunction, false, invert));
 414         m_alternative = parenthesesDisjunction->addNewAlternative();
 415         m_invertParentheticalAssertion = invert;
 416     }
 417
 418     void atomParenthesesEnd()
 419     {
 420         ASSERT(m_alternative->m_parent);
 421         ASSERT(m_alternative->m_parent->m_parent);
 422
 423         PatternDisjunction* parenthesesDisjunction = m_alternative->m_parent;
 424         m_alternative = m_alternative->m_parent->m_parent;
 425
 426         PatternTerm& lastTerm = m_alternative->lastTerm();
 427
 428         unsigned numParenAlternatives = parenthesesDisjunction->m_alternatives.size();
 429         unsigned numBOLAnchoredAlts = 0;
 430
 431         for (unsigned i = 0; i < numParenAlternatives; i++) {
 432             // Bubble up BOL flags
 433             if (parenthesesDisjunction->m_alternatives[i]->m_startsWithBOL)
 434                 numBOLAnchoredAlts++;
 435         }
 436
 437         if (numBOLAnchoredAlts) {
 438             m_alternative->m_containsBOL = true;
 439             // If all the alternatives in parens start with BOL, then so does this one
 440             if (numBOLAnchoredAlts == numParenAlternatives)
 441                 m_alternative->m_startsWithBOL = true;
 442         }
 443
 444         lastTerm.parentheses.lastSubpatternId = m_pattern.m_numSubpatterns;
 445         m_invertParentheticalAssertion = false;
 446     }
 447
 448     void atomBackReference(unsigned subpatternId)
 449     {
 450         ASSERT(subpatternId);
 451         m_pattern.m_containsBackreferences = true;
 452         m_pattern.m_maxBackReference = std::max(m_pattern.m_maxBackReference, subpatternId);
 453
 454         if (subpatternId > m_pattern.m_numSubpatterns) {
 455             m_alternative->m_terms.append(PatternTerm::ForwardReference());
 456             return;
 457         }
 458
 459         PatternAlternative* currentAlternative = m_alternative;
 460         ASSERT(currentAlternative);
 461
 462         // Note to self: if we waited until the AST was baked, we could also remove forwards refs
 463         while ((currentAlternative = currentAlternative->m_parent->m_parent)) {
 464             PatternTerm& term = currentAlternative->lastTerm();
 465             ASSERT((term.type == PatternTerm::TypeParenthesesSubpattern) || (term.type == PatternTerm::TypeParentheticalAssertion));
 466
 467             if ((term.type == PatternTerm::TypeParenthesesSubpattern) && term.capture() && (subpatternId == term.parentheses.subpatternId)) {
 468                 m_alternative->m_terms.append(PatternTerm::ForwardReference());
 469                 return;
 470             }
 471         }
 472
 473         m_alternative->m_terms.append(PatternTerm(subpatternId));
 474     }
 475
 476     // deep copy the argument disjunction.  If filterStartsWithBOL is true,
 477     // skip alternatives with m_startsWithBOL set true.
 478     PatternDisjunction* copyDisjunction(PatternDisjunction* disjunction, bool filterStartsWithBOL = false)
 479     {
 480         PatternDisjunction* newDisjunction = 0;
 481         for (unsigned alt = 0; alt < disjunction->m_alternatives.size(); ++alt) {
 482             PatternAlternative* alternative = disjunction->m_alternatives[alt];
 483             if (!filterStartsWithBOL || !alternative->m_startsWithBOL) {
 484                 if (!newDisjunction) {
 485                     newDisjunction = new PatternDisjunction();
 486                     newDisjunction->m_parent = disjunction->m_parent;
 487                 }
 488                 PatternAlternative* newAlternative = newDisjunction->addNewAlternative();
 489                 for (unsigned i = 0; i < alternative->m_terms.size(); ++i)
 490                     newAlternative->m_terms.append(copyTerm(alternative->m_terms[i], filterStartsWithBOL));
 491             }
 492         }
 493
 494         if (newDisjunction)
 495             m_pattern.m_disjunctions.append(newDisjunction);
 496         return newDisjunction;
 497     }
 498
 499     PatternTerm copyTerm(PatternTerm& term, bool filterStartsWithBOL = false)
 500     {
 501         if ((term.type != PatternTerm::TypeParenthesesSubpattern) && (term.type != PatternTerm::TypeParentheticalAssertion))
 502             return PatternTerm(term);
 503
 504         PatternTerm termCopy = term;
 505         termCopy.parentheses.disjunction = copyDisjunction(termCopy.parentheses.disjunction, filterStartsWithBOL);
 506         return termCopy;
 507     }
 508
 509     void quantifyAtom(unsigned min, unsigned max, bool greedy)
 510     {
 511         ASSERT(min <= max);
 512         ASSERT(m_alternative->m_terms.size());
 513
 514         if (!max) {
 515             m_alternative->removeLastTerm();
 516             return;
 517         }
 518
 519         PatternTerm& term = m_alternative->lastTerm();
 520         ASSERT(term.type > PatternTerm::TypeAssertionWordBoundary);
 521         ASSERT((term.quantityCount == 1) && (term.quantityType == QuantifierFixedCount));
 522
 523         if (term.type == PatternTerm::TypeParentheticalAssertion) {
 524             // If an assertion is quantified with a minimum count of zero, it can simply be removed.
 525             // This arises from the RepeatMatcher behaviour in the spec. Matching an assertion never
 526             // results in any input being consumed, however the continuation passed to the assertion
 527             // (called in steps, 8c and 9 of the RepeatMatcher definition, ES5.1 15.10.2.5) will
 528             // reject all zero length matches (see step 2.1). A match from the continuation of the
 529             // expression will still be accepted regardless (via steps 8a and 11) - the upshot of all
 530             // this is that matches from the assertion are not required, and won't be accepted anyway,
 531             // so no need to ever run it.
 532             if (!min)
 533                 m_alternative->removeLastTerm();
 534             // We never need to run an assertion more than once. Subsequent interations will be run
 535             // with the same start index (since assertions are non-capturing) and the same captures
 536             // (per step 4 of RepeatMatcher in ES5.1 15.10.2.5), and as such will always produce the
 537             // same result and captures. If the first match succeeds then the subsequent (min - 1)
 538             // matches will too. Any additional optional matches will fail (on the same basis as the
 539             // minimum zero quantified assertions, above), but this will still result in a match.
 540             return;
 541         }
 542
 543         if (min == 0)
 544             term.quantify(max, greedy   ? QuantifierGreedy : QuantifierNonGreedy);
 545         else if (min == max)
 546             term.quantify(min, QuantifierFixedCount);
 547         else {
 548             term.quantify(min, QuantifierFixedCount);
 549             m_alternative->m_terms.append(copyTerm(term));
 550             // NOTE: this term is interesting from an analysis perspective, in that it can be ignored.....
 551             m_alternative->lastTerm().quantify((max == quantifyInfinite) ? max : max - min, greedy ? QuantifierGreedy : QuantifierNonGreedy);
 552             if (m_alternative->lastTerm().type == PatternTerm::TypeParenthesesSubpattern)
 553                 m_alternative->lastTerm().parentheses.isCopy = true;
 554         }
 555     }
 556
 557     void disjunction()
 558     {
 559         m_alternative = m_alternative->m_parent->addNewAlternative();
 560     }
 561
 562     unsigned setupAlternativeOffsets(PatternAlternative* alternative, unsigned currentCallFrameSize, unsigned initialInputPosition)
 563     {
 564         alternative->m_hasFixedSize = true;
 565         Checked<unsigned> currentInputPosition = initialInputPosition;
 566
 567         for (unsigned i = 0; i < alternative->m_terms.size(); ++i) {
 568             PatternTerm& term = alternative->m_terms[i];
 569
 570             switch (term.type) {
 571             case PatternTerm::TypeAssertionBOL:
 572             case PatternTerm::TypeAssertionEOL:
 573             case PatternTerm::TypeAssertionWordBoundary:
 574                 term.inputPosition = currentInputPosition.unsafeGet();
 575                 break;
 576
 577             case PatternTerm::TypeBackReference:
 578                 term.inputPosition = currentInputPosition.unsafeGet();
 579                 term.frameLocation = currentCallFrameSize;
 580                 currentCallFrameSize += YarrStackSpaceForBackTrackInfoBackReference;
 581                 alternative->m_hasFixedSize = false;
 582                 break;
 583
 584             case PatternTerm::TypeForwardReference:
 585                 break;
 586
 587             case PatternTerm::TypePatternCharacter:
 588                 term.inputPosition = currentInputPosition.unsafeGet();
 589                 if (term.quantityType != QuantifierFixedCount) {
 590                     term.frameLocation = currentCallFrameSize;
 591                     currentCallFrameSize += YarrStackSpaceForBackTrackInfoPatternCharacter;
 592                     alternative->m_hasFixedSize = false;
 593                 } else
 594                     currentInputPosition += term.quantityCount;
 595                 break;
 596
 597             case PatternTerm::TypeCharacterClass:
 598                 term.inputPosition = currentInputPosition.unsafeGet();
 599                 if (term.quantityType != QuantifierFixedCount) {
 600                     term.frameLocation = currentCallFrameSize;
 601                     currentCallFrameSize += YarrStackSpaceForBackTrackInfoCharacterClass;
 602                     alternative->m_hasFixedSize = false;
 603                 } else
 604                     currentInputPosition += term.quantityCount;
 605                 break;
 606
 607             case PatternTerm::TypeParenthesesSubpattern:
 608                 // Note: for fixed once parentheses we will ensure at least the minimum is available; others are on their own.
 609                 term.frameLocation = currentCallFrameSize;
 610                 if (term.quantityCount == 1 && !term.parentheses.isCopy) {
 611                     if (term.quantityType != QuantifierFixedCount)
 612                         currentCallFrameSize += YarrStackSpaceForBackTrackInfoParenthesesOnce;
 613                     currentCallFrameSize = setupDisjunctionOffsets(term.parentheses.disjunction, currentCallFrameSize, currentInputPosition.unsafeGet());
 614                     // If quantity is fixed, then pre-check its minimum size.
 615                     if (term.quantityType == QuantifierFixedCount)
 616                         currentInputPosition += term.parentheses.disjunction->m_minimumSize;
 617                     term.inputPosition = currentInputPosition.unsafeGet();
 618                 } else if (term.parentheses.isTerminal) {
 619                     currentCallFrameSize += YarrStackSpaceForBackTrackInfoParenthesesTerminal;
 620                     currentCallFrameSize = setupDisjunctionOffsets(term.parentheses.disjunction, currentCallFrameSize, currentInputPosition.unsafeGet());
 621                     term.inputPosition = currentInputPosition.unsafeGet();
 622                 } else {
 623                     term.inputPosition = currentInputPosition.unsafeGet();
 624                     setupDisjunctionOffsets(term.parentheses.disjunction, 0, currentInputPosition.unsafeGet());
 625                     currentCallFrameSize += YarrStackSpaceForBackTrackInfoParentheses;
 626                 }
 627                 // Fixed count of 1 could be accepted, if they have a fixed size *AND* if all alternatives are of the same length.
 628                 alternative->m_hasFixedSize = false;
 629                 break;
 630
 631             case PatternTerm::TypeParentheticalAssertion:
 632                 term.inputPosition = currentInputPosition.unsafeGet();
 633                 term.frameLocation = currentCallFrameSize;
 634                 currentCallFrameSize = setupDisjunctionOffsets(term.parentheses.disjunction, currentCallFrameSize + YarrStackSpaceForBackTrackInfoParentheticalAssertion, currentInputPosition.unsafeGet());
 635                 break;
 636
 637             case PatternTerm::TypeDotStarEnclosure:
 638                 alternative->m_hasFixedSize = false;
 639                 term.inputPosition = initialInputPosition;
 640                 break;
 641             }
 642         }
 643
 644         alternative->m_minimumSize = (currentInputPosition - initialInputPosition).unsafeGet();
 645         return currentCallFrameSize;
 646     }
 647
 648     unsigned setupDisjunctionOffsets(PatternDisjunction* disjunction, unsigned initialCallFrameSize, unsigned initialInputPosition)
 649     {
 650         if ((disjunction != m_pattern.m_body) && (disjunction->m_alternatives.size() > 1))
 651             initialCallFrameSize += YarrStackSpaceForBackTrackInfoAlternative;
 652
 653         unsigned minimumInputSize = UINT_MAX;
 654         unsigned maximumCallFrameSize = 0;
 655         bool hasFixedSize = true;
 656
 657         for (unsigned alt = 0; alt < disjunction->m_alternatives.size(); ++alt) {
 658             PatternAlternative* alternative = disjunction->m_alternatives[alt];
 659             unsigned currentAlternativeCallFrameSize = setupAlternativeOffsets(alternative, initialCallFrameSize, initialInputPosition);
 660             minimumInputSize = min(minimumInputSize, alternative->m_minimumSize);
 661             maximumCallFrameSize = max(maximumCallFrameSize, currentAlternativeCallFrameSize);
 662             hasFixedSize &= alternative->m_hasFixedSize;
 663         }
 664
 665         ASSERT(minimumInputSize != UINT_MAX);
 666         ASSERT(maximumCallFrameSize >= initialCallFrameSize);
 667
 668         disjunction->m_hasFixedSize = hasFixedSize;
 669         disjunction->m_minimumSize = minimumInputSize;
 670         disjunction->m_callFrameSize = maximumCallFrameSize;
 671         return maximumCallFrameSize;
 672     }
 673
 674     void setupOffsets()
 675     {
 676         setupDisjunctionOffsets(m_pattern.m_body, 0, 0);
 677     }
 678
 679     // This optimization identifies sets of parentheses that we will never need to backtrack.
 680     // In these cases we do not need to store state from prior iterations.
 681     // We can presently avoid backtracking for:
 682     //   * where the parens are at the end of the regular expression (last term in any of the
 683     //     alternatives of the main body disjunction).
 684     //   * where the parens are non-capturing, and quantified unbounded greedy (*).
 685     //   * where the parens do not contain any capturing subpatterns.
 686     void checkForTerminalParentheses()
 687     {
 688         // This check is much too crude; should be just checking whether the candidate
 689         // node contains nested capturing subpatterns, not the whole expression!
 690         if (m_pattern.m_numSubpatterns)
 691             return;
 692
 693         Vector<PatternAlternative*>& alternatives = m_pattern.m_body->m_alternatives;
 694         for (size_t i = 0; i < alternatives.size(); ++i) {
 695             Vector<PatternTerm>& terms = alternatives[i]->m_terms;
 696             if (terms.size()) {
 697                 PatternTerm& term = terms.last();
 698                 if (term.type == PatternTerm::TypeParenthesesSubpattern
 699                     && term.quantityType == QuantifierGreedy
 700                     && term.quantityCount == quantifyInfinite
 701                     && !term.capture())
 702                     term.parentheses.isTerminal = true;
 703             }
 704         }
 705     }
 706
 707     void optimizeBOL()
 708     {
 709         // Look for expressions containing beginning of line (^) anchoring and unroll them.
 710         // e.g. /^a|^b|c/ becomes /^a|^b|c/ which is executed once followed by /c/ which loops
 711         // This code relies on the parsing code tagging alternatives with m_containsBOL and
 712         // m_startsWithBOL and rolling those up to containing alternatives.
 713         // At this point, this is only valid for non-multiline expressions.
 714         PatternDisjunction* disjunction = m_pattern.m_body;
 715
 716         if (!m_pattern.m_containsBOL || m_pattern.m_multiline)
 717             return;
 718
 719         PatternDisjunction* loopDisjunction = copyDisjunction(disjunction, true);
 720
 721         // Set alternatives in disjunction to "onceThrough"
 722         for (unsigned alt = 0; alt < disjunction->m_alternatives.size(); ++alt)
 723             disjunction->m_alternatives[alt]->setOnceThrough();
 724
 725         if (loopDisjunction) {
 726             // Move alternatives from loopDisjunction to disjunction
 727             for (unsigned alt = 0; alt < loopDisjunction->m_alternatives.size(); ++alt)
 728                 disjunction->m_alternatives.append(loopDisjunction->m_alternatives[alt]);
 729
 730             loopDisjunction->m_alternatives.clear();
 731         }
 732     }
 733
 734     bool containsCapturingTerms(PatternAlternative* alternative, size_t firstTermIndex, size_t lastTermIndex)
 735     {
 736         Vector<PatternTerm>& terms = alternative->m_terms;
 737
 738         for (size_t termIndex = firstTermIndex; termIndex <= lastTermIndex; ++termIndex) {
 739             PatternTerm& term = terms[termIndex];
 740
 741             if (term.m_capture)
 742                 return true;
 743
 744             if (term.type == PatternTerm::TypeParenthesesSubpattern) {
 745                 PatternDisjunction* nestedDisjunction = term.parentheses.disjunction;
 746                 for (unsigned alt = 0; alt < nestedDisjunction->m_alternatives.size(); ++alt) {
 747                     if (containsCapturingTerms(nestedDisjunction->m_alternatives[alt], 0, nestedDisjunction->m_alternatives[alt]->m_terms.size() - 1))
 748                         return true;
 749                 }
 750             }
 751         }
 752
 753         return false;
 754     }
 755
 756     // This optimization identifies alternatives in the form of
 757     // [^].*[?]<expression>.*[$] for expressions that don't have any
 758     // capturing terms. The alternative is changed to <expression>
 759     // followed by processing of the dot stars to find and adjust the
 760     // beginning and the end of the match.
 761     void optimizeDotStarWrappedExpressions()
 762     {
 763         Vector<PatternAlternative*>& alternatives = m_pattern.m_body->m_alternatives;
 764         if (alternatives.size() != 1)
 765             return;
 766
 767         PatternAlternative* alternative = alternatives[0];
 768         Vector<PatternTerm>& terms = alternative->m_terms;
 769         if (terms.size() >= 3) {
 770             bool startsWithBOL = false;
 771             bool endsWithEOL = false;
 772             size_t termIndex, firstExpressionTerm, lastExpressionTerm;
 773
 774             termIndex = 0;
 775             if (terms[termIndex].type == PatternTerm::TypeAssertionBOL) {
 776                 startsWithBOL = true;
 777                 ++termIndex;
 778             }
 779
 780             PatternTerm& firstNonAnchorTerm = terms[termIndex];
 781             if ((firstNonAnchorTerm.type != PatternTerm::TypeCharacterClass) || (firstNonAnchorTerm.characterClass != m_pattern.newlineCharacterClass()) || !((firstNonAnchorTerm.quantityType == QuantifierGreedy) || (firstNonAnchorTerm.quantityType == QuantifierNonGreedy)))
 782                 return;
 783
 784             firstExpressionTerm = termIndex + 1;
 785
 786             termIndex = terms.size() - 1;
 787             if (terms[termIndex].type == PatternTerm::TypeAssertionEOL) {
 788                 endsWithEOL = true;
 789                 --termIndex;
 790             }
 791
 792             PatternTerm& lastNonAnchorTerm = terms[termIndex];
 793             if ((lastNonAnchorTerm.type != PatternTerm::TypeCharacterClass) || (lastNonAnchorTerm.characterClass != m_pattern.newlineCharacterClass()) || (lastNonAnchorTerm.quantityType != QuantifierGreedy))
 794                 return;
 795
 796             lastExpressionTerm = termIndex - 1;
 797
 798             if (firstExpressionTerm > lastExpressionTerm)
 799                 return;
 800
 801             if (!containsCapturingTerms(alternative, firstExpressionTerm, lastExpressionTerm)) {
 802                 for (termIndex = terms.size() - 1; termIndex > lastExpressionTerm; --termIndex)
 803                     terms.remove(termIndex);
 804
 805                 for (termIndex = firstExpressionTerm; termIndex > 0; --termIndex)
 806                     terms.remove(termIndex - 1);
 807
 808                 terms.append(PatternTerm(startsWithBOL, endsWithEOL));
 809
 810                 m_pattern.m_containsBOL = false;
 811             }
 812         }
 813     }
 814
 815 private:
 816     YarrPattern& m_pattern;
 817     PatternAlternative* m_alternative;
 818     CharacterClassConstructor m_characterClassConstructor;
 819     bool m_invertCharacterClass;
 820     bool m_invertParentheticalAssertion;
 821 };
 822
 823 const char* YarrPattern::compile(const UString& patternString)
 824 {
 825     YarrPatternConstructor constructor(*this);
 826
 827     if (const char* error = parse(constructor, patternString))
 828         return error;
 829
 830     // If the pattern contains illegal backreferences reset & reparse.
 831     // Quoting Netscape's "What's new in JavaScript 1.2",
 832     //      "Note: if the number of left parentheses is less than the number specified
 833     //       in \#, the \# is taken as an octal escape as described in the next row."
 834     if (containsIllegalBackReference()) {
 835         unsigned numSubpatterns = m_numSubpatterns;
 836
 837         constructor.reset();
 838 #if !ASSERT_DISABLED
 839         const char* error =
 840 #endif
 841             parse(constructor, patternString, numSubpatterns);
 842
 843         ASSERT(!error);
 844         ASSERT(numSubpatterns == m_numSubpatterns);
 845     }
 846
 847     constructor.checkForTerminalParentheses();
 848     constructor.optimizeDotStarWrappedExpressions();
 849     constructor.optimizeBOL();
 850
 851     constructor.setupOffsets();
 852
 853     return 0;
 854 }
 855
 856 YarrPattern::YarrPattern(const UString& pattern, bool ignoreCase, bool multiline, const char** error)
 857     : m_ignoreCase(ignoreCase)
 858     , m_multiline(multiline)
 859     , m_containsBackreferences(false)
 860     , m_containsBOL(false)
 861     , m_numSubpatterns(0)
 862     , m_maxBackReference(0)
 863     , newlineCached(0)
 864     , digitsCached(0)
 865     , spacesCached(0)
 866     , wordcharCached(0)
 867     , nondigitsCached(0)
 868     , nonspacesCached(0)
 869     , nonwordcharCached(0)
 870 {
 871     *error = compile(pattern);
 872 }
 873
 874 } }