/*
 *  Copyright (C) 1999-2000 Harri Porten (porten@kde.org)
 *  Copyright (C) 2006, 2007, 2008, 2009 Apple Inc. All Rights Reserved.
 *  Copyright (C) 2007 Cameron Zwarich (cwzwarich@uwaterloo.ca)
 *
 *  This library is free software; you can redistribute it and/or
 *  modify it under the terms of the GNU Library General Public
 *  License as published by the Free Software Foundation; either
 *  version 2 of the License, or (at your option) any later version.
 *
 *  This library is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 *  Library General Public License for more details.
 *
 *  You should have received a copy of the GNU Library General Public License
 *  along with this library; see the file COPYING.LIB.  If not, write to
 *  the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
 *  Boston, MA 02110-1301, USA.
 */
  26 #include "JSFunction.h" 
  27 #include "JSGlobalObjectFunctions.h" 
  34 #include <wtf/Assertions.h> 
  37 using namespace Unicode
; 
  39 // We can't specify the namespace in yacc's C output, so do it here instead. 
  44 #include "Lexer.lut.h" 
  48 static const UChar byteOrderMark 
= 0xFEFF; 
  50 Lexer::Lexer(JSGlobalData
* globalData
) 
  51     : m_isReparsing(false) 
  52     , m_globalData(globalData
) 
  53     , m_keywordTable(JSC::mainTable
) 
  55     m_buffer8
.reserveInitialCapacity(initialReadBufferCapacity
); 
  56     m_buffer16
.reserveInitialCapacity(initialReadBufferCapacity
); 
  61     m_keywordTable
.deleteTable(); 
  64 inline const UChar
* Lexer::currentCharacter() const 
  69 inline int Lexer::currentOffset() const 
  71     return currentCharacter() - m_codeStart
; 
  74 ALWAYS_INLINE 
void Lexer::shift1() 
  79     if (LIKELY(m_code 
< m_codeEnd
)) 
  87 ALWAYS_INLINE 
void Lexer::shift2() 
  91     if (LIKELY(m_code 
+ 1 < m_codeEnd
)) { 
  95         m_next2 
= m_code 
< m_codeEnd 
? m_code
[0] : -1; 
 102 ALWAYS_INLINE 
void Lexer::shift3() 
 105     if (LIKELY(m_code 
+ 2 < m_codeEnd
)) { 
 110         m_next1 
= m_code 
< m_codeEnd 
? m_code
[0] : -1; 
 111         m_next2 
= m_code 
+ 1 < m_codeEnd 
? m_code
[1] : -1; 
 118 ALWAYS_INLINE 
void Lexer::shift4() 
 120     if (LIKELY(m_code 
+ 3 < m_codeEnd
)) { 
 121         m_current 
= m_code
[0]; 
 126         m_current 
= m_code 
< m_codeEnd 
? m_code
[0] : -1; 
 127         m_next1 
= m_code 
+ 1 < m_codeEnd 
? m_code
[1] : -1; 
 128         m_next2 
= m_code 
+ 2 < m_codeEnd 
? m_code
[2] : -1; 
 135 void Lexer::setCode(const SourceCode
& source
, ParserArena
& arena
) 
 137     m_arena 
= &arena
.identifierArena(); 
 139     m_lineNumber 
= source
.firstLine(); 
 143     const UChar
* data 
= source
.provider()->data(); 
 147     m_code 
= data 
+ source
.startOffset(); 
 148     m_codeEnd 
= data 
+ source
.endOffset(); 
 150     m_atLineStart 
= true; 
 152     // ECMA-262 calls for stripping all Cf characters, but we only strip BOM characters. 
 153     // See <https://bugs.webkit.org/show_bug.cgi?id=4931> for details. 
 154     if (source
.provider()->hasBOMs()) { 
 155         for (const UChar
* p 
= m_codeStart
; p 
< m_codeEnd
; ++p
) { 
 156             if (UNLIKELY(*p 
== byteOrderMark
)) { 
 157                 copyCodeWithoutBOMs(); 
 163     // Read the first characters into the 4-character buffer. 
 165     ASSERT(currentOffset() == source
.startOffset()); 
 168 void Lexer::copyCodeWithoutBOMs() 
 170     // Note: In this case, the character offset data for debugging will be incorrect. 
 171     // If it's important to correctly debug code with extraneous BOMs, then the caller 
 172     // should strip the BOMs when creating the SourceProvider object and do its own 
 173     // mapping of offsets within the stripped text to original text offset. 
 175     m_codeWithoutBOMs
.reserveCapacity(m_codeEnd 
- m_code
); 
 176     for (const UChar
* p 
= m_code
; p 
< m_codeEnd
; ++p
) { 
 178         if (c 
!= byteOrderMark
) 
 179             m_codeWithoutBOMs
.append(c
); 
 181     ptrdiff_t startDelta 
= m_codeStart 
- m_code
; 
 182     m_code 
= m_codeWithoutBOMs
.data(); 
 183     m_codeStart 
= m_code 
+ startDelta
; 
 184     m_codeEnd 
= m_codeWithoutBOMs
.data() + m_codeWithoutBOMs
.size(); 
 187 void Lexer::shiftLineTerminator() 
 189     ASSERT(isLineTerminator(m_current
)); 
 191     // Allow both CRLF and LFCR. 
 192     if (m_current 
+ m_next1 
== '\n' + '\r') 
 200 ALWAYS_INLINE 
const Identifier
* Lexer::makeIdentifier(const UChar
* characters
, size_t length
) 
 202     return &m_arena
->makeIdentifier(m_globalData
, characters
, length
); 
 205 inline bool Lexer::lastTokenWasRestrKeyword() const 
 207     return m_lastToken 
== CONTINUE 
|| m_lastToken 
== BREAK 
|| m_lastToken 
== RETURN 
|| m_lastToken 
== THROW
; 
 210 static NEVER_INLINE 
bool isNonASCIIIdentStart(int c
) 
 212     return category(c
) & (Letter_Uppercase 
| Letter_Lowercase 
| Letter_Titlecase 
| Letter_Modifier 
| Letter_Other
); 
 215 static inline bool isIdentStart(int c
) 
 217     return isASCII(c
) ? isASCIIAlpha(c
) || c 
== '$' || c 
== '_' : isNonASCIIIdentStart(c
); 
 220 static NEVER_INLINE 
bool isNonASCIIIdentPart(int c
) 
 222     return category(c
) & (Letter_Uppercase 
| Letter_Lowercase 
| Letter_Titlecase 
| Letter_Modifier 
| Letter_Other
 
 223         | Mark_NonSpacing 
| Mark_SpacingCombining 
| Number_DecimalDigit 
| Punctuation_Connector
); 
 226 static inline bool isIdentPart(int c
) 
 228     return isASCII(c
) ? isASCIIAlphanumeric(c
) || c 
== '$' || c 
== '_' : isNonASCIIIdentPart(c
); 
 231 static inline int singleEscape(int c
) 
 251 inline void Lexer::record8(int c
) 
 255     m_buffer8
.append(static_cast<char>(c
)); 
 258 inline void Lexer::record16(UChar c
) 
 260     m_buffer16
.append(c
); 
 263 inline void Lexer::record16(int c
) 
 266     ASSERT(c 
<= USHRT_MAX
); 
 267     record16(UChar(static_cast<unsigned short>(c
))); 
 270 int Lexer::lex(void* p1
, void* p2
) 
 273     ASSERT(m_buffer8
.isEmpty()); 
 274     ASSERT(m_buffer16
.isEmpty()); 
 276     YYSTYPE
* lvalp 
= static_cast<YYSTYPE
*>(p1
); 
 277     YYLTYPE
* llocp 
= static_cast<YYLTYPE
*>(p2
); 
 279     m_terminator 
= false; 
 282     while (isWhiteSpace(m_current
)) 
 285     int startOffset 
= currentOffset(); 
 287     if (m_current 
== -1) { 
 288         if (!m_terminator 
&& !m_delimited 
&& !m_isReparsing
) { 
 289             // automatic semicolon insertion if program incomplete 
 299             if (m_next1 
== '>' && m_next2 
== '>') { 
 300                 if (m_next3 
== '=') { 
 302                     token 
= URSHIFTEQUAL
; 
 309             if (m_next1 
== '>') { 
 310                 if (m_next2 
== '=') { 
 319             if (m_next1 
== '=') { 
 328             if (m_next1 
== '=') { 
 329                 if (m_next2 
== '=') { 
 342             if (m_next1 
== '=') { 
 343                 if (m_next2 
== '=') { 
 356             if (m_next1 
== '!' && m_next2 
== '-' && m_next3 
== '-') { 
 357                 // <!-- marks the beginning of a line comment (for www usage) 
 359                 goto inSingleLineComment
; 
 361             if (m_next1 
== '<') { 
 362                 if (m_next2 
== '=') { 
 371             if (m_next1 
== '=') { 
 380             if (m_next1 
== '+') { 
 383                     token 
= AUTOPLUSPLUS
; 
 389             if (m_next1 
== '=') { 
 398             if (m_next1 
== '-') { 
 399                 if (m_atLineStart 
&& m_next2 
== '>') { 
 401                     goto inSingleLineComment
; 
 405                     token 
= AUTOMINUSMINUS
; 
 411             if (m_next1 
== '=') { 
 420             if (m_next1 
== '=') { 
 429             if (m_next1 
== '/') { 
 431                 goto inSingleLineComment
; 
 434                 goto inMultiLineComment
; 
 435             if (m_next1 
== '=') { 
 444             if (m_next1 
== '&') { 
 449             if (m_next1 
== '=') { 
 458             if (m_next1 
== '=') { 
 467             if (m_next1 
== '=') { 
 476             if (m_next1 
== '=') { 
 481             if (m_next1 
== '|') { 
 490             if (isASCIIDigit(m_next1
)) { 
 493                 goto inNumberAfterDecimalPoint
; 
 515             lvalp
->intValue 
= currentOffset(); 
 520             lvalp
->intValue 
= currentOffset(); 
 526             goto startIdentifierWithBackslash
; 
 528             goto startNumberWithZeroDigit
; 
 543             if (isIdentStart(m_current
)) 
 544                 goto startIdentifierOrKeyword
; 
 545             if (isLineTerminator(m_current
)) { 
 546                 shiftLineTerminator(); 
 547                 m_atLineStart 
= true; 
 549                 if (lastTokenWasRestrKeyword()) { 
 558     m_atLineStart 
= false; 
 562     int stringQuoteCharacter 
= m_current
; 
 565     const UChar
* stringStart 
= currentCharacter(); 
 566     while (m_current 
!= stringQuoteCharacter
) { 
 567         // Fast check for characters that require special handling. 
 568         // Catches -1, \n, \r, \, 0x2028, and 0x2029 as efficiently 
 569         // as possible, and lets through all common ASCII characters. 
 570         if (UNLIKELY(m_current 
== '\\') || UNLIKELY(((static_cast<unsigned>(m_current
) - 0xE) & 0x2000))) { 
 571             m_buffer16
.append(stringStart
, currentCharacter() - stringStart
); 
 576     lvalp
->ident 
= makeIdentifier(stringStart
, currentCharacter() - stringStart
); 
 578     m_atLineStart 
= false; 
 584     while (m_current 
!= stringQuoteCharacter
) { 
 585         if (m_current 
== '\\') 
 586             goto inStringEscapeSequence
; 
 587         if (UNLIKELY(isLineTerminator(m_current
))) 
 589         if (UNLIKELY(m_current 
== -1)) 
 596 inStringEscapeSequence
: 
 598     if (m_current 
== 'x') { 
 600         if (isASCIIHexDigit(m_current
) && isASCIIHexDigit(m_next1
)) { 
 601             record16(convertHex(m_current
, m_next1
)); 
 606         if (m_current 
== stringQuoteCharacter
) 
 610     if (m_current 
== 'u') { 
 612         if (isASCIIHexDigit(m_current
) && isASCIIHexDigit(m_next1
) && isASCIIHexDigit(m_next2
) && isASCIIHexDigit(m_next3
)) { 
 613             record16(convertUnicode(m_current
, m_next1
, m_next2
, m_next3
)); 
 617         if (m_current 
== stringQuoteCharacter
) { 
 623     if (isASCIIOctalDigit(m_current
)) { 
 624         if (m_current 
>= '0' && m_current 
<= '3' && isASCIIOctalDigit(m_next1
) && isASCIIOctalDigit(m_next2
)) { 
 625             record16((m_current 
- '0') * 64 + (m_next1 
- '0') * 8 + m_next2 
- '0'); 
 629         if (isASCIIOctalDigit(m_next1
)) { 
 630             record16((m_current 
- '0') * 8 + m_next1 
- '0'); 
 634         record16(m_current 
- '0'); 
 638     if (isLineTerminator(m_current
)) { 
 639         shiftLineTerminator(); 
 644     record16(singleEscape(m_current
)); 
 649 startIdentifierWithBackslash
: 
 651     if (UNLIKELY(m_current 
!= 'u')) 
 654     if (UNLIKELY(!isASCIIHexDigit(m_current
) || !isASCIIHexDigit(m_next1
) || !isASCIIHexDigit(m_next2
) || !isASCIIHexDigit(m_next3
))) 
 656     token 
= convertUnicode(m_current
, m_next1
, m_next2
, m_next3
); 
 657     if (UNLIKELY(!isIdentStart(token
))) 
 659     goto inIdentifierAfterCharacterCheck
; 
 661 startIdentifierOrKeyword
: { 
 662     const UChar
* identifierStart 
= currentCharacter(); 
 664     while (isIdentPart(m_current
)) 
 666     if (LIKELY(m_current 
!= '\\')) { 
 667         lvalp
->ident 
= makeIdentifier(identifierStart
, currentCharacter() - identifierStart
); 
 668         goto doneIdentifierOrKeyword
; 
 670     m_buffer16
.append(identifierStart
, currentCharacter() - identifierStart
); 
 675         if (UNLIKELY(m_current 
!= 'u')) 
 678         if (UNLIKELY(!isASCIIHexDigit(m_current
) || !isASCIIHexDigit(m_next1
) || !isASCIIHexDigit(m_next2
) || !isASCIIHexDigit(m_next3
))) 
 680         token 
= convertUnicode(m_current
, m_next1
, m_next2
, m_next3
); 
 681         if (UNLIKELY(!isIdentPart(token
))) 
 683 inIdentifierAfterCharacterCheck
: 
 687         while (isIdentPart(m_current
)) { 
 691     } while (UNLIKELY(m_current 
== '\\')); 
 695     while (!isLineTerminator(m_current
)) { 
 696         if (UNLIKELY(m_current 
== -1)) 
 700     shiftLineTerminator(); 
 701     m_atLineStart 
= true; 
 703     if (lastTokenWasRestrKeyword()) 
 709     while (m_current 
!= '*' || m_next1 
!= '/') { 
 710         if (isLineTerminator(m_current
)) 
 711             shiftLineTerminator(); 
 714             if (UNLIKELY(m_current 
== -1)) 
 719     m_atLineStart 
= false; 
 722 startNumberWithZeroDigit
: 
 724     if ((m_current 
| 0x20) == 'x' && isASCIIHexDigit(m_next1
)) { 
 728     if (m_current 
== '.') { 
 732         goto inNumberAfterDecimalPoint
; 
 734     if ((m_current 
| 0x20) == 'e') { 
 738         goto inExponentIndicator
; 
 740     if (isASCIIOctalDigit(m_current
)) 
 742     if (isASCIIDigit(m_current
)) 
 744     lvalp
->doubleValue 
= 0; 
 747 inNumberAfterDecimalPoint
: 
 748     while (isASCIIDigit(m_current
)) { 
 752     if ((m_current 
| 0x20) == 'e') { 
 755         goto inExponentIndicator
; 
 760     if (m_current 
== '+' || m_current 
== '-') { 
 764     if (!isASCIIDigit(m_current
)) 
 769     } while (isASCIIDigit(m_current
)); 
 776     } while (isASCIIOctalDigit(m_current
)); 
 777     if (isASCIIDigit(m_current
)) 
 782     const char* end 
= m_buffer8
.end(); 
 783     for (const char* p 
= m_buffer8
.data(); p 
< end
; ++p
) { 
 787     if (dval 
>= mantissaOverflowLowerBound
) 
 788         dval 
= parseIntOverflow(m_buffer8
.data(), end 
- m_buffer8
.data(), 8); 
 792     lvalp
->doubleValue 
= dval
; 
 800     } while (isASCIIHexDigit(m_current
)); 
 804     const char* end 
= m_buffer8
.end(); 
 805     for (const char* p 
= m_buffer8
.data(); p 
< end
; ++p
) { 
 807         dval 
+= toASCIIHexValue(*p
); 
 809     if (dval 
>= mantissaOverflowLowerBound
) 
 810         dval 
= parseIntOverflow(m_buffer8
.data(), end 
- m_buffer8
.data(), 16); 
 814     lvalp
->doubleValue 
= dval
; 
 821     while (isASCIIDigit(m_current
)) { 
 825     if (m_current 
== '.') { 
 828         goto inNumberAfterDecimalPoint
; 
 830     if ((m_current 
| 0x20) == 'e') { 
 833         goto inExponentIndicator
; 
 836     // Fall through into doneNumber. 
 839     // Null-terminate string for strtod. 
 840     m_buffer8
.append('\0'); 
 841     lvalp
->doubleValue 
= WTF::strtod(m_buffer8
.data(), 0); 
 844     // Fall through into doneNumeric. 
 847     // No identifiers allowed directly after numeric literal, e.g. "3in" is bad. 
 848     if (UNLIKELY(isIdentStart(m_current
))) 
 851     m_atLineStart 
= false; 
 862     m_atLineStart 
= false; 
 864     lvalp
->ident 
= makeIdentifier(m_buffer16
.data(), m_buffer16
.size()); 
 865     m_buffer16
.resize(0); 
 869 doneIdentifierOrKeyword
: { 
 870     m_atLineStart 
= false; 
 872     m_buffer16
.resize(0); 
 873     const HashEntry
* entry 
= m_keywordTable
.entry(m_globalData
, *lvalp
->ident
); 
 874     token 
= entry 
? entry
->lexerValue() : IDENT
; 
 879     // Atomize constant strings in case they're later used in property lookup. 
 881     m_atLineStart 
= false; 
 883     lvalp
->ident 
= makeIdentifier(m_buffer16
.data(), m_buffer16
.size()); 
 884     m_buffer16
.resize(0); 
 887     // Fall through into returnToken. 
 890     int lineNumber 
= m_lineNumber
; 
 891     llocp
->first_line 
= lineNumber
; 
 892     llocp
->last_line 
= lineNumber
; 
 893     llocp
->first_column 
= startOffset
; 
 894     llocp
->last_column 
= currentOffset(); 
 905 bool Lexer::scanRegExp(const Identifier
*& pattern
, const Identifier
*& flags
, UChar patternPrefix
) 
 907     ASSERT(m_buffer16
.isEmpty()); 
 909     bool lastWasEscape 
= false; 
 910     bool inBrackets 
= false; 
 913         ASSERT(!isLineTerminator(patternPrefix
)); 
 914         ASSERT(patternPrefix 
!= '/'); 
 915         ASSERT(patternPrefix 
!= '['); 
 916         record16(patternPrefix
); 
 920         int current 
= m_current
; 
 922         if (isLineTerminator(current
) || current 
== -1) { 
 923             m_buffer16
.resize(0); 
 929         if (current 
== '/' && !lastWasEscape 
&& !inBrackets
) 
 935             lastWasEscape 
= false; 
 947             lastWasEscape 
= true; 
 952     pattern 
= makeIdentifier(m_buffer16
.data(), m_buffer16
.size()); 
 953     m_buffer16
.resize(0); 
 955     while (isIdentPart(m_current
)) { 
 960     flags 
= makeIdentifier(m_buffer16
.data(), m_buffer16
.size()); 
 961     m_buffer16
.resize(0); 
 966 bool Lexer::skipRegExp() 
 968     bool lastWasEscape 
= false; 
 969     bool inBrackets 
= false; 
 972         int current 
= m_current
; 
 974         if (isLineTerminator(current
) || current 
== -1) 
 979         if (current 
== '/' && !lastWasEscape 
&& !inBrackets
) 
 983             lastWasEscape 
= false; 
 995             lastWasEscape 
= true; 
1000     while (isIdentPart(m_current
)) 
1009     m_codeWithoutBOMs
.clear(); 
1011     Vector
<char> newBuffer8
; 
1012     newBuffer8
.reserveInitialCapacity(initialReadBufferCapacity
); 
1013     m_buffer8
.swap(newBuffer8
); 
1015     Vector
<UChar
> newBuffer16
; 
1016     newBuffer16
.reserveInitialCapacity(initialReadBufferCapacity
); 
1017     m_buffer16
.swap(newBuffer16
); 
1019     m_isReparsing 
= false; 
1022 SourceCode 
Lexer::sourceCode(int openBrace
, int closeBrace
, int firstLine
) 
1024     if (m_codeWithoutBOMs
.isEmpty()) 
1025         return SourceCode(m_source
->provider(), openBrace
, closeBrace 
+ 1, firstLine
); 
1027     const UChar
* data 
= m_source
->provider()->data(); 
1029     ASSERT(openBrace 
< closeBrace
); 
1031     for (i 
= m_source
->startOffset(); i 
< openBrace
; ++i
) { 
1032         if (data
[i
] == byteOrderMark
) { 
1037     for (; i 
< closeBrace
; ++i
) { 
1038         if (data
[i
] == byteOrderMark
) 
1042     ASSERT(openBrace 
< closeBrace
); 
1044     return SourceCode(m_source
->provider(), openBrace
, closeBrace 
+ 1, firstLine
);