2  *  Copyright (C) 1999-2000 Harri Porten (porten@kde.org) 
   3  *  Copyright (C) 2006, 2007, 2008, 2009 Apple Inc. All Rights Reserved. 
   4  *  Copyright (C) 2007 Cameron Zwarich (cwzwarich@uwaterloo.ca) 
   6  *  This library is free software; you can redistribute it and/or 
   7  *  modify it under the terms of the GNU Library General Public 
   8  *  License as published by the Free Software Foundation; either 
   9  *  version 2 of the License, or (at your option) any later version. 
  11  *  This library is distributed in the hope that it will be useful, 
  12  *  but WITHOUT ANY WARRANTY; without even the implied warranty of 
  13  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU 
  14  *  Library General Public License for more details. 
  16  *  You should have received a copy of the GNU Library General Public License 
  17  *  along with this library; see the file COPYING.LIB.  If not, write to 
  18  *  the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, 
  19  *  Boston, MA 02110-1301, USA. 
  26 #include "JSFunction.h" 
  27 #include "JSGlobalObjectFunctions.h" 
  34 #include <wtf/Assertions.h> 
  37 using namespace Unicode
; 
  39 // We can't specify the namespace in yacc's C output, so do it here instead. 
  47 #include "Lexer.lut.h" 
  49 // A bridge for yacc from the C world to the C++ world. 
  // Lexing entry point with a C-compatible signature for the yacc parser.
  // globalData is an opaque pointer that is cast to JSGlobalData*; lvalp and
  // llocp are forwarded untouched to that object's lexer, and the value
  // returned by Lexer::lex() is passed straight back to the caller.
  50 int jscyylex(void* lvalp
, void* llocp
, void* globalData
) 
  52     return static_cast<JSGlobalData
*>(globalData
)->lexer
->lex(lvalp
, llocp
); 
  // Code unit 0xFEFF. setCode() scans the source for this value and
  // copyCodeWithoutBOMs() drops every occurrence; sourceCode() counts
  // occurrences to adjust the offsets it returns.
  57 static const UChar byteOrderMark 
= 0xFEFF; 
  // Constructs a lexer tied to globalData. Reparsing starts disabled, the
  // keyword table is initialised from JSC::mainTable, and both m_buffer8 and
  // m_buffer16 get initialReadBufferCapacity entries reserved up front.
  59 Lexer::Lexer(JSGlobalData
* globalData
) 
  60     : m_isReparsing(false) 
  61     , m_globalData(globalData
) 
  62     , m_keywordTable(JSC::mainTable
) 
  64     m_buffer8
.reserveInitialCapacity(initialReadBufferCapacity
); 
  65     m_buffer16
.reserveInitialCapacity(initialReadBufferCapacity
); 
  // Releases the keyword table that the constructor initialised.
  // NOTE(review): the enclosing function header is not visible at this point;
  // presumably this is the body of Lexer::~Lexer() — confirm.
  70     m_keywordTable
.deleteTable(); 
  // Returns a const UChar* for the current lexing position. currentOffset()
  // subtracts m_codeStart from it, and lex() uses it as the start and end of
  // identifier and string spans.
  // NOTE(review): the body is not visible here; how the pointer is derived
  // from m_code and the look-ahead window needs to be confirmed.
  73 inline const UChar
* Lexer::currentCharacter() const 
  // Position of the current character expressed as an int offset from
  // m_codeStart (the pointer difference currentCharacter() - m_codeStart).
  78 inline int Lexer::currentOffset() const 
  80     return currentCharacter() - m_codeStart
; 
  // The visible code checks m_code < m_codeEnd (marked LIKELY) before touching
  // the source text.
  // NOTE(review): presumably this slides the m_current/m_next1/m_next2/m_next3
  // window forward by one character and fills the last slot with -1 at end of
  // input, mirroring shift2()-shift4() — confirm against the full body.
  83 ALWAYS_INLINE 
void Lexer::shift1() 
  88     if (LIKELY(m_code 
< m_codeEnd
)) 
  // Two-character variant of the shift helpers.
  // When at least two more characters are available (m_code + 1 < m_codeEnd)
  // the LIKELY branch is taken. Otherwise each slot is bounds-checked
  // individually: m_next2 receives m_code[0] if m_code is still in range and
  // -1 (the end-of-input sentinel tested by lex()) if not.
  // NOTE(review): the remaining slot assignments are not visible here — confirm.
  96 ALWAYS_INLINE 
void Lexer::shift2() 
 100     if (LIKELY(m_code 
+ 1 < m_codeEnd
)) { 
 104         m_next2 
= m_code 
< m_codeEnd 
? m_code
[0] : -1; 
 // Three-character variant of the shift helpers.
 // The LIKELY branch covers the case where three more characters are
 // available (m_code + 2 < m_codeEnd). Near the end of the input, m_next1 and
 // m_next2 are loaded from m_code[0] and m_code[1] only if those positions
 // are inside the buffer; otherwise they become -1 (end-of-input sentinel).
 // NOTE(review): the remaining slot assignments are not visible here — confirm.
 111 ALWAYS_INLINE 
void Lexer::shift3() 
 114     if (LIKELY(m_code 
+ 2 < m_codeEnd
)) { 
 119         m_next1 
= m_code 
< m_codeEnd 
? m_code
[0] : -1; 
 120         m_next2 
= m_code 
+ 1 < m_codeEnd 
? m_code
[1] : -1; 
 // Four-character variant of the shift helpers; setCode()'s comment describes
 // its use as filling the 4-character buffer.
 // With four characters available (m_code + 3 < m_codeEnd) m_current is read
 // directly from m_code[0]. Otherwise m_current, m_next1 and m_next2 are each
 // read only if their position is inside the buffer and are set to -1 (the
 // end-of-input sentinel) when it is not.
 // NOTE(review): the other slot assignments and any advance of m_code are not
 // visible here — confirm.
 127 ALWAYS_INLINE 
void Lexer::shift4() 
 129     if (LIKELY(m_code 
+ 3 < m_codeEnd
)) { 
 130         m_current 
= m_code
[0]; 
 135         m_current 
= m_code 
< m_codeEnd 
? m_code
[0] : -1; 
 136         m_next1 
= m_code 
+ 1 < m_codeEnd 
? m_code
[1] : -1; 
 137         m_next2 
= m_code 
+ 2 < m_codeEnd 
? m_code
[2] : -1; 
 // Points the lexer at a new piece of source text.
 // The starting line number comes from source.firstLine(); m_code and
 // m_codeEnd are set to the [startOffset, endOffset) range inside the
 // provider's character data, and the lexer is marked as being at the start
 // of a line.
 144 void Lexer::setCode(const SourceCode
& source
) 
 146     m_lineNumber 
= source
.firstLine(); 
 150     const UChar
* data 
= source
.provider()->data(); 
 154     m_code 
= data 
+ source
.startOffset(); 
 155     m_codeEnd 
= data 
+ source
.endOffset(); 
 157     m_atLineStart 
= true; 
 159     // ECMA-262 calls for stripping all Cf characters, but we only strip BOM characters. 
 160     // See <https://bugs.webkit.org/show_bug.cgi?id=4931> for details. 
          // The provider's hasBOMs() flag gates the scan, so sources without
          // BOMs skip it entirely. The scan runs from m_codeStart to m_codeEnd
          // and, on finding a BOM, switches to a BOM-free copy of the text.
 161     if (source
.provider()->hasBOMs()) { 
 162         for (const UChar
* p 
= m_codeStart
; p 
< m_codeEnd
; ++p
) { 
 163             if (UNLIKELY(*p 
== byteOrderMark
)) { 
 164                 copyCodeWithoutBOMs(); 
 170     // Read the first characters into the 4-character buffer. 
          // After priming the buffer, the reported offset must equal the
          // source's start offset.
 172     ASSERT(currentOffset() == source
.startOffset()); 
 // Builds m_codeWithoutBOMs from the range [m_code, m_codeEnd) with every
 // byteOrderMark removed, then repoints m_code, m_codeStart and m_codeEnd at
 // that copy so lexing proceeds over BOM-free text.
 175 void Lexer::copyCodeWithoutBOMs() 
 177     // Note: In this case, the character offset data for debugging will be incorrect. 
 178     // If it's important to correctly debug code with extraneous BOMs, then the caller 
 179     // should strip the BOMs when creating the SourceProvider object and do its own 
 180     // mapping of offsets within the stripped text to original text offset. 
          // Reserve for the worst case (no character is dropped).
 182     m_codeWithoutBOMs
.reserveCapacity(m_codeEnd 
- m_code
); 
 183     for (const UChar
* p 
= m_code
; p 
< m_codeEnd
; ++p
) { 
 185         if (c 
!= byteOrderMark
) 
 186             m_codeWithoutBOMs
.append(c
); 
          // Keep m_codeStart at the same distance from m_code as before, so
          // that offsets computed against m_codeStart keep the same base.
 188     ptrdiff_t startDelta 
= m_codeStart 
- m_code
; 
 189     m_code 
= m_codeWithoutBOMs
.data(); 
 190     m_codeStart 
= m_code 
+ startDelta
; 
 191     m_codeEnd 
= m_codeWithoutBOMs
.data() + m_codeWithoutBOMs
.size(); 
 // Handles a line terminator at the current position; callers must only
 // invoke it when m_current is a line terminator (asserted below).
 // NOTE(review): the consuming shift calls and any line-number update are not
 // visible here — confirm.
 194 void Lexer::shiftLineTerminator() 
 196     ASSERT(isLineTerminator(m_current
)); 
 198     // Allow both CRLF and LFCR. 
          // Given that m_current is a line terminator, the sum equals
          // '\n' + '\r' only for the two-character pairs "\r\n" and "\n\r",
          // so one comparison detects both orders.
 199     if (m_current 
+ m_next1 
== '\n' + '\r') 
 // Creates an Identifier from `length` UChars starting at `characters`,
 // stores it in m_identifiers, and returns a pointer to the stored element
 // (not to a temporary).
 207 ALWAYS_INLINE Identifier
* Lexer::makeIdentifier(const UChar
* characters
, size_t length
) 
 209     m_identifiers
.append(Identifier(m_globalData
, characters
, length
)); 
 210     return &m_identifiers
.last(); 
 // True when the previously produced token (m_lastToken) is one of CONTINUE,
 // BREAK, RETURN or THROW. lex() consults this right after consuming a line
 // terminator or a single-line comment.
 // NOTE(review): presumably used to insert an automatic semicolon after these
 // keywords when a line break follows — confirm.
 213 inline bool Lexer::lastTokenWasRestrKeyword() const 
 215     return m_lastToken 
== CONTINUE 
|| m_lastToken 
== BREAK 
|| m_lastToken 
== RETURN 
|| m_lastToken 
== THROW
; 
 // Identifier-start test for non-ASCII characters: true when category(c) has
 // any of the letter category bits (uppercase, lowercase, titlecase,
 // modifier, other). Marked NEVER_INLINE, so it stays out of line from the
 // inline isIdentStart() that calls it.
 218 static NEVER_INLINE 
bool isNonASCIIIdentStart(int c
) 
 220     return category(c
) & (Letter_Uppercase 
| Letter_Lowercase 
| Letter_Titlecase 
| Letter_Modifier 
| Letter_Other
); 
 // Returns whether c may begin an identifier. ASCII characters qualify when
 // they are alphabetic, '$' or '_'; everything else is decided by
 // isNonASCIIIdentStart().
 223 static inline bool isIdentStart(int c
) 
 225     return isASCII(c
) ? isASCIIAlpha(c
) || c 
== '$' || c 
== '_' : isNonASCIIIdentStart(c
); 
 // Identifier-continuation test for non-ASCII characters: accepts every
 // category allowed at the start of an identifier (the letter categories)
 // plus non-spacing marks, spacing combining marks, decimal digits and
 // connector punctuation.
 228 static NEVER_INLINE 
bool isNonASCIIIdentPart(int c
) 
 230     return category(c
) & (Letter_Uppercase 
| Letter_Lowercase 
| Letter_Titlecase 
| Letter_Modifier 
| Letter_Other
 
 231         | Mark_NonSpacing 
| Mark_SpacingCombining 
| Number_DecimalDigit 
| Punctuation_Connector
); 
 // Returns whether c may appear inside an identifier after its first
 // character. ASCII characters qualify when they are alphanumeric, '$' or
 // '_'; everything else is decided by isNonASCIIIdentPart().
 234 static inline bool isIdentPart(int c
) 
 236     return isASCII(c
) ? isASCIIAlphanumeric(c
) || c 
== '$' || c 
== '_' : isNonASCIIIdentPart(c
); 
 // Translates the character that follows a backslash in a string literal.
 // lex() records singleEscape(m_current) for escapes that are not \x, \u,
 // octal, or a line terminator.
 // NOTE(review): the body is not visible here, so the exact mapping
 // (presumably \n, \t, etc., with other characters returned unchanged) is
 // unconfirmed.
 239 static inline int singleEscape(int c
) 
 // Appends c, narrowed to char, to m_buffer8. lex() later reads m_buffer8
 // back as the digit text of numeric literals.
 259 inline void Lexer::record8(int c
) 
 263     m_buffer8
.append(static_cast<char>(c
)); 
 // Appends one UTF-16 code unit to m_buffer16.
 266 inline void Lexer::record16(UChar c
) 
 268     m_buffer16
.append(c
); 
 // int overload: asserts that c does not exceed USHRT_MAX, then narrows it to
 // unsigned short and appends it through the UChar overload.
 271 inline void Lexer::record16(int c
) 
 274     ASSERT(c 
<= USHRT_MAX
); 
 275     record16(UChar(static_cast<unsigned short>(c
))); 
 // Scans the next token from the text installed by setCode().
 // p1 and p2 are the parser's opaque value and location slots, cast below to
 // YYSTYPE* and YYLTYPE*. Token payloads are written through lvalp (ident,
 // doubleValue, intValue) and the token's line and offset range through llocp.
 // The function is a goto-driven state machine: labels such as
 // inStringEscapeSequence, startIdentifierOrKeyword and
 // inNumberAfterDecimalPoint are states.
 // NOTE(review): the return statement and several labels/case lines are not
 // visible here; presumably the token code is returned — confirm.
 278 int Lexer::lex(void* p1
, void* p2
) 
          // Both buffers are expected to be empty between tokens.
 281     ASSERT(m_buffer8
.isEmpty()); 
 282     ASSERT(m_buffer16
.isEmpty()); 
 284     YYSTYPE
* lvalp 
= static_cast<YYSTYPE
*>(p1
); 
 285     YYLTYPE
* llocp 
= static_cast<YYLTYPE
*>(p2
); 
 287     m_terminator 
= false; 
          // Loop while the current character is white space.
 290     while (isWhiteSpace(m_current
)) 
          // Offset of the first character of the token; reported as
          // llocp->first_column at the end.
 293     int startOffset 
= currentOffset(); 
          // -1 is the end-of-input sentinel loaded by the shift helpers.
 295     if (m_current 
== -1) { 
 296         if (!m_terminator 
&& !m_delimited 
&& !m_isReparsing
) { 
 297             // automatic semicolon insertion if program incomplete 
                  // Operator recognition: m_next1..m_next3 provide up to three
                  // characters of look-ahead past m_current.
 307             if (m_next1 
== '>' && m_next2 
== '>') { 
 308                 if (m_next3 
== '=') { 
 310                     token 
= URSHIFTEQUAL
; 
 317             if (m_next1 
== '>') { 
 318                 if (m_next2 
== '=') { 
 327             if (m_next1 
== '=') { 
 336             if (m_next1 
== '=') { 
 337                 if (m_next2 
== '=') { 
 350             if (m_next1 
== '=') { 
 351                 if (m_next2 
== '=') { 
 364             if (m_next1 
== '!' && m_next2 
== '-' && m_next3 
== '-') { 
 365                 // <!-- marks the beginning of a line comment (for www usage) 
 367                 goto inSingleLineComment
; 
 369             if (m_next1 
== '<') { 
 370                 if (m_next2 
== '=') { 
 379             if (m_next1 
== '=') { 
 388             if (m_next1 
== '+') { 
 391                     token 
= AUTOPLUSPLUS
; 
 397             if (m_next1 
== '=') { 
 406             if (m_next1 
== '-') { 
                      // A following '>' is treated as the start of a
                      // single-line comment only when the lexer is at the
                      // start of a line.
                      // NOTE(review): presumably m_current is '-' here, making
                      // the sequence "-->" — confirm.
 407                 if (m_atLineStart 
&& m_next2 
== '>') { 
 409                     goto inSingleLineComment
; 
 413                     token 
= AUTOMINUSMINUS
; 
 419             if (m_next1 
== '=') { 
 428             if (m_next1 
== '=') { 
 437             if (m_next1 
== '/') { 
 439                 goto inSingleLineComment
; 
 442                 goto inMultiLineComment
; 
 443             if (m_next1 
== '=') { 
 452             if (m_next1 
== '&') { 
 457             if (m_next1 
== '=') { 
 466             if (m_next1 
== '=') { 
 475             if (m_next1 
== '=') { 
 484             if (m_next1 
== '=') { 
 489             if (m_next1 
== '|') { 
                  // A digit in the next position starts a fractional numeric
                  // literal.
                  // NOTE(review): presumably m_current is '.' here — confirm.
 498             if (isASCIIDigit(m_next1
)) { 
 501                 goto inNumberAfterDecimalPoint
; 
                  // The token's payload is the current source offset.
                  // NOTE(review): presumably for '{' and '}', whose offsets
                  // sourceCode() later receives as openBrace/closeBrace — confirm.
 523             lvalp
->intValue 
= currentOffset(); 
 528             lvalp
->intValue 
= currentOffset(); 
 534             goto startIdentifierWithBackslash
; 
 536             goto startNumberWithZeroDigit
; 
                  // Remaining characters: identifier starts and line terminators.
 551             if (isIdentStart(m_current
)) 
 552                 goto startIdentifierOrKeyword
; 
 553             if (isLineTerminator(m_current
)) { 
 554                 shiftLineTerminator(); 
 555                 m_atLineStart 
= true; 
 557                 if (lastTokenWasRestrKeyword()) { 
 566     m_atLineStart 
= false; 
          // String literal: remember which quote character opened it; scanning
          // runs until the same character is seen again.
 570     int stringQuoteCharacter 
= m_current
; 
 573     const UChar
* stringStart 
= currentCharacter(); 
 574     while (m_current 
!= stringQuoteCharacter
) { 
 575         // Fast check for characters that require special handling. 
 576         // Catches -1, \n, \r, \, 0x2028, and 0x2029 as efficiently 
 577         // as possible, and lets through all common ASCII characters. 
              // Bit 13 of (c - 0xE), computed in unsigned arithmetic, is set
              // for every c below 0xE (the subtraction wraps, which covers -1,
              // \n and \r) and for ranges such as 0x200E..0x400D (which cover
              // 0x2028 and 0x2029). On a hit, the characters scanned so far are
              // copied into m_buffer16.
              // NOTE(review): presumably control then moves to the slower
              // per-character string loop below — confirm.
 578         if (UNLIKELY(m_current 
== '\\') || UNLIKELY(((static_cast<unsigned>(m_current
) - 0xE) & 0x2000))) { 
 579             m_buffer16
.append(stringStart
, currentCharacter() - stringStart
); 
          // No special character was met: the string value is taken directly
          // from the source span [stringStart, current position).
 584     lvalp
->ident 
= makeIdentifier(stringStart
, currentCharacter() - stringStart
); 
 586     m_atLineStart 
= false; 
          // Per-character string loop: handles escapes and checks for line
          // terminators and end of input inside the literal.
 592     while (m_current 
!= stringQuoteCharacter
) { 
 593         if (m_current 
== '\\') 
 594             goto inStringEscapeSequence
; 
 595         if (UNLIKELY(isLineTerminator(m_current
))) 
 597         if (UNLIKELY(m_current 
== -1)) 
 604 inStringEscapeSequence
: 
          // \xHH: two hex digits are required for the escape to be decoded.
 606     if (m_current 
== 'x') { 
 608         if (isASCIIHexDigit(m_current
) && isASCIIHexDigit(m_next1
)) { 
 609             record16(convertHex(m_current
, m_next1
)); 
 614         if (m_current 
== stringQuoteCharacter
) 
          // \uHHHH: four hex digits are required for the escape to be decoded.
 618     if (m_current 
== 'u') { 
 620         if (isASCIIHexDigit(m_current
) && isASCIIHexDigit(m_next1
) && isASCIIHexDigit(m_next2
) && isASCIIHexDigit(m_next3
)) { 
 621             record16(convertUnicode(m_current
, m_next1
, m_next2
, m_next3
)); 
 625         if (m_current 
== stringQuoteCharacter
) { 
          // Octal escapes: three digits are taken only when the first is 0-3,
          // which keeps the value at or below 255 (\377); otherwise two digits
          // or one digit are used.
 631     if (isASCIIOctalDigit(m_current
)) { 
 632         if (m_current 
>= '0' && m_current 
<= '3' && isASCIIOctalDigit(m_next1
) && isASCIIOctalDigit(m_next2
)) { 
 633             record16((m_current 
- '0') * 64 + (m_next1 
- '0') * 8 + m_next2 
- '0'); 
 637         if (isASCIIOctalDigit(m_next1
)) { 
 638             record16((m_current 
- '0') * 8 + m_next1 
- '0'); 
 642         record16(m_current 
- '0'); 
          // Backslash followed by a line terminator: the terminator is consumed
          // through shiftLineTerminator().
 646     if (isLineTerminator(m_current
)) { 
 647         shiftLineTerminator(); 
          // Any other escaped character goes through singleEscape().
 650     record16(singleEscape(m_current
)); 
 655 startIdentifierWithBackslash
: 
          // An identifier may begin with an escape: it must be 'u' followed by
          // four hex digits, and the decoded character must be a valid
          // identifier start.
          // NOTE(review): the handling of each failing check is not visible
          // here; presumably an error path — confirm.
 657     if (UNLIKELY(m_current 
!= 'u')) 
 660     if (UNLIKELY(!isASCIIHexDigit(m_current
) || !isASCIIHexDigit(m_next1
) || !isASCIIHexDigit(m_next2
) || !isASCIIHexDigit(m_next3
))) 
 662     token 
= convertUnicode(m_current
, m_next1
, m_next2
, m_next3
); 
 663     if (UNLIKELY(!isIdentStart(token
))) 
 665     goto inIdentifierAfterCharacterCheck
; 
 667 startIdentifierOrKeyword
: { 
 668     const UChar
* identifierStart 
= currentCharacter(); 
 670     while (isIdentPart(m_current
)) 
          // Fast path: no backslash follows, so the identifier is created
          // directly from the source span without copying into m_buffer16.
 672     if (LIKELY(m_current 
!= '\\')) { 
 673         lvalp
->ident 
= makeIdentifier(identifierStart
, currentCharacter() - identifierStart
); 
 674         goto doneIdentifierOrKeyword
; 
          // Slow path: copy what has been scanned into m_buffer16, then decode
          // each escape; here the decoded character must be a valid identifier
          // part.
 676     m_buffer16
.append(identifierStart
, currentCharacter() - identifierStart
); 
 681         if (UNLIKELY(m_current 
!= 'u')) 
 684         if (UNLIKELY(!isASCIIHexDigit(m_current
) || !isASCIIHexDigit(m_next1
) || !isASCIIHexDigit(m_next2
) || !isASCIIHexDigit(m_next3
))) 
 686         token 
= convertUnicode(m_current
, m_next1
, m_next2
, m_next3
); 
 687         if (UNLIKELY(!isIdentPart(token
))) 
 689 inIdentifierAfterCharacterCheck
: 
 693         while (isIdentPart(m_current
)) { 
 697     } while (UNLIKELY(m_current 
== '\\')); 
          // Single-line comment: loops until a line terminator, checking for
          // end of input on each iteration; then consumes the terminator and
          // marks the lexer as being at a line start.
 701     while (!isLineTerminator(m_current
)) { 
 702         if (UNLIKELY(m_current 
== -1)) 
 706     shiftLineTerminator(); 
 707     m_atLineStart 
= true; 
 709     if (lastTokenWasRestrKeyword()) 
          // Multi-line comment: loops until "*/" is at the current position;
          // line terminators inside go through shiftLineTerminator() and end of
          // input is checked.
 715     while (m_current 
!= '*' || m_next1 
!= '/') { 
 716         if (isLineTerminator(m_current
)) 
 717             shiftLineTerminator(); 
 720             if (UNLIKELY(m_current 
== -1)) 
 725     m_atLineStart 
= false; 
 728 startNumberWithZeroDigit
: 
          // OR-ing with 0x20 folds ASCII upper case onto lower case, so this
          // matches both 'x' and 'X' (and 'e'/'E' below). A hex digit must
          // follow the 'x'.
 730     if ((m_current 
| 0x20) == 'x' && isASCIIHexDigit(m_next1
)) { 
 734     if (m_current 
== '.') { 
 738         goto inNumberAfterDecimalPoint
; 
 740     if ((m_current 
| 0x20) == 'e') { 
 744         goto inExponentIndicator
; 
 746     if (isASCIIOctalDigit(m_current
)) 
 748     if (isASCIIDigit(m_current
)) 
          // None of the above followed the leading zero: the literal is plain 0.
 750     lvalp
->doubleValue 
= 0; 
 753 inNumberAfterDecimalPoint
: 
 754     while (isASCIIDigit(m_current
)) { 
 758     if ((m_current 
| 0x20) == 'e') { 
 761         goto inExponentIndicator
; 
          // Exponent: an optional sign, then at least one decimal digit.
 766     if (m_current 
== '+' || m_current 
== '-') { 
 770     if (!isASCIIDigit(m_current
)) 
 775     } while (isASCIIDigit(m_current
)); 
 782     } while (isASCIIOctalDigit(m_current
)); 
 783     if (isASCIIDigit(m_current
)) 
          // Octal literal: the digits held in m_buffer8 are folded into dval.
          // Once dval reaches mantissaOverflowLowerBound the value is recomputed
          // with parseIntOverflow() in radix 8.
          // NOTE(review): presumably because the incremental double
          // accumulation is no longer exact beyond that bound — confirm.
 788     const char* end 
= m_buffer8
.end(); 
 789     for (const char* p 
= m_buffer8
.data(); p 
< end
; ++p
) { 
 793     if (dval 
>= mantissaOverflowLowerBound
) 
 794         dval 
= parseIntOverflow(m_buffer8
.data(), end 
- m_buffer8
.data(), 8); 
 798     lvalp
->doubleValue 
= dval
; 
 806     } while (isASCIIHexDigit(m_current
)); 
          // Hex literal: same scheme as octal, using toASCIIHexValue() per
          // digit and radix 16 for the overflow recomputation.
 810     const char* end 
= m_buffer8
.end(); 
 811     for (const char* p 
= m_buffer8
.data(); p 
< end
; ++p
) { 
 813         dval 
+= toASCIIHexValue(*p
); 
 815     if (dval 
>= mantissaOverflowLowerBound
) 
 816         dval 
= parseIntOverflow(m_buffer8
.data(), end 
- m_buffer8
.data(), 16); 
 820     lvalp
->doubleValue 
= dval
; 
          // Decimal literal: integer digits, then an optional fraction and/or
          // exponent.
 827     while (isASCIIDigit(m_current
)) { 
 831     if (m_current 
== '.') { 
 834         goto inNumberAfterDecimalPoint
; 
 836     if ((m_current 
| 0x20) == 'e') { 
 839         goto inExponentIndicator
; 
 842     // Fall through into doneNumber. 
 845     // Null-terminate string for strtod. 
 846     m_buffer8
.append('\0'); 
 847     lvalp
->doubleValue 
= WTF::strtod(m_buffer8
.data(), 0); 
 850     // Fall through into doneNumeric. 
 853     // No identifiers allowed directly after numeric literal, e.g. "3in" is bad. 
 854     if (UNLIKELY(isIdentStart(m_current
))) 
 857     m_atLineStart 
= false; 
 868     m_atLineStart 
= false; 
          // Identifier assembled in m_buffer16 (it contained escapes): build it
          // from the buffer and empty the buffer for the next token.
 870     lvalp
->ident 
= makeIdentifier(m_buffer16
.data(), m_buffer16
.size()); 
 871     m_buffer16
.resize(0); 
 875 doneIdentifierOrKeyword
: { 
 876     m_atLineStart 
= false; 
 878     m_buffer16
.resize(0); 
          // Look the identifier up in the keyword table: a hit yields the
          // keyword's token value, otherwise the token is IDENT.
 879     const HashEntry
* entry 
= m_keywordTable
.entry(m_globalData
, *lvalp
->ident
); 
 880     token 
= entry 
? entry
->lexerValue() : IDENT
; 
 885     // Atomize constant strings in case they're later used in property lookup. 
 887     m_atLineStart 
= false; 
 889     lvalp
->ident 
= makeIdentifier(m_buffer16
.data(), m_buffer16
.size()); 
 890     m_buffer16
.resize(0); 
 893     // Fall through into returnToken. 
          // Location: both line fields get the line number current at the end
          // of the token. The "column" fields carry character offsets (as
          // returned by currentOffset()), not columns within the line.
 896     int lineNumber 
= m_lineNumber
; 
 897     llocp
->first_line 
= lineNumber
; 
 898     llocp
->last_line 
= lineNumber
; 
 899     llocp
->first_column 
= startOffset
; 
 900     llocp
->last_column 
= currentOffset(); 
 // Scans a regular expression literal, storing its body in m_pattern and the
 // trailing flag characters in m_flags. m_buffer16 is used as the
 // accumulator and must be empty on entry.
 // NOTE(review): the return statements are not visible here; presumably
 // false is returned when a line terminator or end of input is reached
 // before the closing '/', and true otherwise — confirm.
 911 bool Lexer::scanRegExp() 
 913     ASSERT(m_buffer16
.isEmpty()); 
 915     bool lastWasEscape 
= false; 
 916     bool inBrackets 
= false; 
              // A literal cannot span lines or run to end of input.
 919         if (isLineTerminator(m_current
) || m_current 
== -1) 
              // A '/' ends the pattern only when it is neither escaped nor
              // inside a [...] character class.
 921         if (m_current 
!= '/' || lastWasEscape 
|| inBrackets
) { 
 922             // keep track of '[' and ']' 
 923             if (!lastWasEscape
) { 
 924                 if (m_current 
== '[' && !inBrackets
) 
 926                 if (m_current 
== ']' && inBrackets
) 
                  // A backslash escapes the next character unless it is itself
                  // escaped.
 930             lastWasEscape 
= !lastWasEscape 
&& m_current 
== '\\'; 
 931         } else { // end of regexp 
 932             m_pattern 
= UString(m_buffer16
); 
 933             m_buffer16
.resize(0); 
          // Flags: every identifier-part character after the closing '/' is
          // scanned.
 940     while (isIdentPart(m_current
)) { 
 944     m_flags 
= UString(m_buffer16
); 
 945     m_buffer16
.resize(0); 
 // Resets per-parse state: drops all identifiers created by makeIdentifier()
 // and the BOM-free source copy, replaces both scratch buffers, turns
 // reparsing off and clears the stored regexp pattern.
 // NOTE(review): the enclosing function header is not visible at this point;
 // presumably this is the body of Lexer::clear() — confirm.
 952     m_identifiers
.clear(); 
 953     m_codeWithoutBOMs
.clear(); 
          // Swap in freshly constructed vectors with the initial capacity
          // reserved; the previous storage ends up in the locals and is
          // destroyed with them.
 955     Vector
<char> newBuffer8
; 
 956     newBuffer8
.reserveInitialCapacity(initialReadBufferCapacity
); 
 957     m_buffer8
.swap(newBuffer8
); 
 959     Vector
<UChar
> newBuffer16
; 
 960     newBuffer16
.reserveInitialCapacity(initialReadBufferCapacity
); 
 961     m_buffer16
.swap(newBuffer16
); 
 963     m_isReparsing 
= false; 
 965     m_pattern 
= UString(); 
 // Builds a SourceCode for the range from openBrace to closeBrace inclusive
 // (the end offset passed on is closeBrace + 1), starting at line firstLine,
 // backed by the current source's provider.
 // When the text was lexed from a BOM-free copy, the offsets are shifted by
 // the number of BOMs counted in the provider's original data.
 969 SourceCode 
Lexer::sourceCode(int openBrace
, int closeBrace
, int firstLine
) 
          // No BOM-free copy exists, so lexer offsets already match the
          // provider's data.
 971     if (m_codeWithoutBOMs
.isEmpty()) 
 972         return SourceCode(m_source
->provider(), openBrace
, closeBrace 
+ 1, firstLine
); 
 974     const UChar
* data 
= m_source
->provider()->data(); 
 976     ASSERT(openBrace 
< closeBrace
); 
 978     int numBOMsBeforeOpenBrace 
= 0; 
 979     int numBOMsBetweenBraces 
= 0; 
          // Count BOMs in the original data: first from the source's start
          // offset up to openBrace, then on to closeBrace. The second loop
          // continues from where the first one stopped.
          // NOTE(review): i indexes the original data while
          // openBrace/closeBrace look like offsets into the BOM-free copy, so
          // BOMs lying between index openBrace and the brace's true original
          // position would not be counted — confirm whether this matters to
          // callers.
 982     for (i 
= m_source
->startOffset(); i 
< openBrace
; ++i
) 
 983         numBOMsBeforeOpenBrace 
+= data
[i
] == byteOrderMark
; 
 984     for (; i 
< closeBrace
; ++i
) 
 985         numBOMsBetweenBraces 
+= data
[i
] == byteOrderMark
; 
 987     return SourceCode(m_source
->provider(), openBrace 
+ numBOMsBeforeOpenBrace
, 
 988         closeBrace 
+ numBOMsBeforeOpenBrace 
+ numBOMsBetweenBraces 
+ 1, firstLine
);