2  *  Copyright (C) 1999-2000 Harri Porten (porten@kde.org) 
   3  *  Copyright (C) 2006, 2007, 2008, 2009 Apple Inc. All Rights Reserved. 
   4  *  Copyright (C) 2007 Cameron Zwarich (cwzwarich@uwaterloo.ca) 
   6  *  This library is free software; you can redistribute it and/or 
   7  *  modify it under the terms of the GNU Library General Public 
   8  *  License as published by the Free Software Foundation; either 
   9  *  version 2 of the License, or (at your option) any later version. 
  11  *  This library is distributed in the hope that it will be useful, 
  12  *  but WITHOUT ANY WARRANTY; without even the implied warranty of 
  13  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU 
  14  *  Library General Public License for more details. 
  16  *  You should have received a copy of the GNU Library General Public License 
  17  *  along with this library; see the file COPYING.LIB.  If not, write to 
  18  *  the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, 
  19  *  Boston, MA 02110-1301, USA. 
  26 #include "JSFunction.h" 
  27 #include "JSGlobalObjectFunctions.h" 
  34 #include <wtf/ASCIICType.h> 
  35 #include <wtf/Assertions.h> 
  36 #include <wtf/unicode/Unicode.h> 
  39 using namespace Unicode
; 
  41 // we can't specify the namespace in yacc's C output, so do it here 
  49 #include "Lexer.lut.h" 
  51 // a bridge for yacc from the C world to C++ 
  52 int jscyylex(void* lvalp
, void* llocp
, void* globalData
) 
  54     return static_cast<JSGlobalData
*>(globalData
)->lexer
->lex(lvalp
, llocp
); 
  59 static bool isDecimalDigit(int); 
  61 Lexer::Lexer(JSGlobalData
* globalData
) 
  63     , m_restrKeyword(false) 
  64     , m_eatNextIdentifier(false) 
  70     , m_isReparsing(false) 
  80     , m_globalData(globalData
) 
  81     , m_mainTable(JSC::mainTable
) 
  83     m_buffer8
.reserveInitialCapacity(initialReadBufferCapacity
); 
  84     m_buffer16
.reserveInitialCapacity(initialReadBufferCapacity
); 
  89     m_mainTable
.deleteTable(); 
  92 void Lexer::setCode(const SourceCode
& source
) 
  94     yylineno 
= source
.firstLine(); 
  95     m_restrKeyword 
= false; 
  97     m_eatNextIdentifier 
= false; 
 101     m_position 
= source
.startOffset(); 
 103     m_code 
= source
.provider()->data(); 
 104     m_length 
= source
.endOffset(); 
 108     m_atLineStart 
= true; 
 110     // read first characters 
 114 void Lexer::shift(unsigned p
) 
 116     // ECMA-262 calls for stripping Cf characters here, but we only do this for BOM, 
 117     // see <https://bugs.webkit.org/show_bug.cgi?id=4931>. 
 123         m_currentOffset 
= m_nextOffset1
; 
 124         m_nextOffset1 
= m_nextOffset2
; 
 125         m_nextOffset2 
= m_nextOffset3
; 
 127             if (m_position 
>= m_length
) { 
 128                 m_nextOffset3 
= m_position
; 
 133             m_nextOffset3 
= m_position
; 
 134             m_next3 
= m_code
[m_position
++]; 
 135         } while (m_next3 
== 0xFEFF); 
 139 // called on each new line 
 140 void Lexer::nextLine() 
 143     m_atLineStart 
= true; 
 146 void Lexer::setDone(State s
) 
 152 int Lexer::lex(void* p1
, void* p2
) 
 154     YYSTYPE
* lvalp 
= static_cast<YYSTYPE
*>(p1
); 
 155     YYLTYPE
* llocp 
= static_cast<YYLTYPE
*>(p2
); 
 158     unsigned short stringType 
= 0; // either single or double quotes 
 162     m_terminator 
= false; 
 166     // did we push a token on the stack previously ? 
 167     // (after an automatic semicolon insertion) 
 168     if (m_stackToken 
>= 0) { 
 170         token 
= m_stackToken
; 
 173     int startOffset 
= m_currentOffset
; 
 175         if (m_skipLF 
&& m_current 
!= '\n') // found \r but not \n afterwards 
 177         if (m_skipCR 
&& m_current 
!= '\r') // found \n but not \r afterwards 
 179         if (m_skipLF 
|| m_skipCR
) { // found \r\n or \n\r -> eat the second one 
 186                 startOffset 
= m_currentOffset
; 
 187                 if (isWhiteSpace()) { 
 189                 } else if (m_current 
== '/' && m_next1 
== '/') { 
 191                     m_state 
= InSingleLineComment
; 
 192                 } else if (m_current 
== '/' && m_next1 
== '*') { 
 194                     m_state 
= InMultiLineComment
; 
 195                 } else if (m_current 
== -1) { 
 196                     if (!m_terminator 
&& !m_delimited 
&& !m_isReparsing
) { 
 197                         // automatic semicolon insertion if program incomplete 
 203                 } else if (isLineTerminator()) { 
 206                     if (m_restrKeyword
) { 
 210                 } else if (m_current 
== '"' || m_current 
== '\'') { 
 212                     stringType 
= static_cast<unsigned short>(m_current
); 
 213                 } else if (isIdentStart(m_current
)) { 
 215                     m_state 
= InIdentifierOrKeyword
; 
 216                 } else if (m_current 
== '\\') 
 217                     m_state 
= InIdentifierStartUnicodeEscapeStart
; 
 218                 else if (m_current 
== '0') { 
 221                 } else if (isDecimalDigit(m_current
)) { 
 224                 } else if (m_current 
== '.' && isDecimalDigit(m_next1
)) { 
 227                     // <!-- marks the beginning of a line comment (for www usage) 
 228                 } else if (m_current 
== '<' && m_next1 
== '!' && m_next2 
== '-' && m_next3 
== '-') { 
 230                     m_state 
= InSingleLineComment
; 
 232                 } else if (m_atLineStart 
&& m_current 
== '-' && m_next1 
== '-' &&  m_next2 
== '>') { 
 234                     m_state 
= InSingleLineComment
; 
 236                     token 
= matchPunctuator(lvalp
->intValue
, m_current
, m_next1
, m_next2
, m_next3
); 
 244                 if (m_current 
== stringType
) { 
 247                 } else if (isLineTerminator() || m_current 
== -1) 
 249                 else if (m_current 
== '\\') 
 250                     m_state 
= InEscapeSequence
; 
 254             // Escape Sequences inside of strings 
 255             case InEscapeSequence
: 
 256                 if (isOctalDigit(m_current
)) { 
 257                     if (m_current 
>= '0' && m_current 
<= '3' && 
 258                         isOctalDigit(m_next1
) && isOctalDigit(m_next2
)) { 
 259                         record16(convertOctal(m_current
, m_next1
, m_next2
)); 
 262                     } else if (isOctalDigit(m_current
) && isOctalDigit(m_next1
)) { 
 263                         record16(convertOctal('0', m_current
, m_next1
)); 
 266                     } else if (isOctalDigit(m_current
)) { 
 267                         record16(convertOctal('0', '0', m_current
)); 
 271                 } else if (m_current 
== 'x') 
 272                     m_state 
= InHexEscape
; 
 273                 else if (m_current 
== 'u') 
 274                     m_state 
= InUnicodeEscape
; 
 275                 else if (isLineTerminator()) { 
 279                     record16(singleEscape(static_cast<unsigned short>(m_current
))); 
 284                 if (isHexDigit(m_current
) && isHexDigit(m_next1
)) { 
 286                     record16(convertHex(m_current
, m_next1
)); 
 288                 } else if (m_current 
== stringType
) { 
 298             case InUnicodeEscape
: 
 299                 if (isHexDigit(m_current
) && isHexDigit(m_next1
) && isHexDigit(m_next2
) && isHexDigit(m_next3
)) { 
 300                     record16(convertUnicode(m_current
, m_next1
, m_next2
, m_next3
)); 
 303                 } else if (m_current 
== stringType
) { 
 310             case InSingleLineComment
: 
 311                 if (isLineTerminator()) { 
 314                     if (m_restrKeyword
) { 
 319                 } else if (m_current 
== -1) 
 322             case InMultiLineComment
: 
 325                 else if (isLineTerminator()) 
 327                 else if (m_current 
== '*' && m_next1 
== '/') { 
 332             case InIdentifierOrKeyword
: 
 334                 if (isIdentPart(m_current
)) 
 336                 else if (m_current 
== '\\') 
 337                     m_state 
= InIdentifierPartUnicodeEscapeStart
; 
 339                     setDone(m_state 
== InIdentifierOrKeyword 
? IdentifierOrKeyword 
: Identifier
); 
 342                 if (m_current 
== 'x' || m_current 
== 'X') { 
 345                 } else if (m_current 
== '.') { 
 348                 } else if (m_current 
== 'e' || m_current 
== 'E') { 
 350                     m_state 
= InExponentIndicator
; 
 351                 } else if (isOctalDigit(m_current
)) { 
 354                 } else if (isDecimalDigit(m_current
)) { 
 361                 if (isHexDigit(m_current
)) 
 367                 if (isOctalDigit(m_current
)) 
 369                 else if (isDecimalDigit(m_current
)) { 
 376                 if (isDecimalDigit(m_current
)) 
 378                 else if (m_current 
== '.') { 
 381                 } else if (m_current 
== 'e' || m_current 
== 'E') { 
 383                     m_state 
= InExponentIndicator
; 
 388                 if (isDecimalDigit(m_current
)) 
 390                 else if (m_current 
== 'e' || m_current 
== 'E') { 
 392                     m_state 
= InExponentIndicator
; 
 396             case InExponentIndicator
: 
 397                 if (m_current 
== '+' || m_current 
== '-') 
 399                 else if (isDecimalDigit(m_current
)) { 
 401                     m_state 
= InExponent
; 
 406                 if (isDecimalDigit(m_current
)) 
 411             case InIdentifierStartUnicodeEscapeStart
: 
 412                 if (m_current 
== 'u') 
 413                     m_state 
= InIdentifierStartUnicodeEscape
; 
 417             case InIdentifierPartUnicodeEscapeStart
: 
 418                 if (m_current 
== 'u') 
 419                     m_state 
= InIdentifierPartUnicodeEscape
; 
 423             case InIdentifierStartUnicodeEscape
: 
 424                 if (!isHexDigit(m_current
) || !isHexDigit(m_next1
) || !isHexDigit(m_next2
) || !isHexDigit(m_next3
)) { 
 428                 token 
= convertUnicode(m_current
, m_next1
, m_next2
, m_next3
); 
 430                 if (!isIdentStart(token
)) { 
 435                 m_state 
= InIdentifier
; 
 437             case InIdentifierPartUnicodeEscape
: 
 438                 if (!isHexDigit(m_current
) || !isHexDigit(m_next1
) || !isHexDigit(m_next2
) || !isHexDigit(m_next3
)) { 
 442                 token 
= convertUnicode(m_current
, m_next1
, m_next2
, m_next3
); 
 444                 if (!isIdentPart(token
)) { 
 449                 m_state 
= InIdentifier
; 
 452                 ASSERT(!"Unhandled state in switch statement"); 
 455         // move on to the next character 
 458         if (m_state 
!= Start 
&& m_state 
!= InSingleLineComment
) 
 459             m_atLineStart 
= false; 
 462     // no identifiers allowed directly after numeric literal, e.g. "3in" is bad 
 463     if ((m_state 
== Number 
|| m_state 
== Octal 
|| m_state 
== Hex
) && isIdentStart(m_current
)) 
 467     m_buffer8
.append('\0'); 
 470     fprintf(stderr
, "line: %d ", lineNo()); 
 471     fprintf(stderr
, "yytext (%x): ", m_buffer8
[0]); 
 472     fprintf(stderr
, "%s ", m_buffer8
.data()); 
 476     if (m_state 
== Number
) 
 477         dval 
= WTF::strtod(m_buffer8
.data(), 0L); 
 478     else if (m_state 
== Hex
) { // scan hex numbers 
 479         const char* p 
= m_buffer8
.data() + 2; 
 480         while (char c 
= *p
++) { 
 482             dval 
+= convertHex(c
); 
 485         if (dval 
>= mantissaOverflowLowerBound
) 
 486             dval 
= parseIntOverflow(m_buffer8
.data() + 2, p 
- (m_buffer8
.data() + 3), 16); 
 489     } else if (m_state 
== Octal
) {   // scan octal number 
 490         const char* p 
= m_buffer8
.data() + 1; 
 491         while (char c 
= *p
++) { 
 496         if (dval 
>= mantissaOverflowLowerBound
) 
 497             dval 
= parseIntOverflow(m_buffer8
.data() + 1, p 
- (m_buffer8
.data() + 2), 8); 
 511             printf("(Identifier)/(Keyword)\n"); 
 514             printf("(String)\n"); 
 517             printf("(Number)\n"); 
 524     if (m_state 
!= Identifier
) 
 525         m_eatNextIdentifier 
= false; 
 527     m_restrKeyword 
= false; 
 529     llocp
->first_line 
= yylineno
; 
 530     llocp
->last_line 
= yylineno
; 
 531     llocp
->first_column 
= startOffset
; 
 532     llocp
->last_column 
= m_currentOffset
; 
 538             if (token 
== '}' || token 
== ';') 
 542             // Apply anonymous-function hack below (eat the identifier). 
 543             if (m_eatNextIdentifier
) { 
 544                 m_eatNextIdentifier 
= false; 
 545                 token 
= lex(lvalp
, llocp
); 
 548             lvalp
->ident 
= makeIdentifier(m_buffer16
); 
 551         case IdentifierOrKeyword
: { 
 552             lvalp
->ident 
= makeIdentifier(m_buffer16
); 
 553             const HashEntry
* entry 
= m_mainTable
.entry(m_globalData
, *lvalp
->ident
); 
 555                 // Lookup for keyword failed, means this is an identifier. 
 559             token 
= entry
->lexerValue(); 
 560             // Hack for "f = function somename() { ... }"; too hard to get into the grammar. 
 561             m_eatNextIdentifier 
= token 
== FUNCTION 
&& m_lastToken 
== '='; 
 562             if (token 
== CONTINUE 
|| token 
== BREAK 
|| token 
== RETURN 
|| token 
== THROW
) 
 563                 m_restrKeyword 
= true; 
 567             // Atomize constant strings in case they're later used in property lookup. 
 568             lvalp
->ident 
= makeIdentifier(m_buffer16
); 
 572             lvalp
->doubleValue 
= dval
; 
 577             fprintf(stderr
, "yylex: ERROR.\n"); 
 582             ASSERT(!"unhandled numeration value in switch"); 
 590 bool Lexer::isWhiteSpace() const 
 592     return m_current 
== '\t' || m_current 
== 0x0b || m_current 
== 0x0c || isSeparatorSpace(m_current
); 
 595 bool Lexer::isLineTerminator() 
 597     bool cr 
= (m_current 
== '\r'); 
 598     bool lf 
= (m_current 
== '\n'); 
 603     return cr 
|| lf 
|| m_current 
== 0x2028 || m_current 
== 0x2029; 
 606 bool Lexer::isIdentStart(int c
) 
 608     return isASCIIAlpha(c
) || c 
== '$' || c 
== '_' || (!isASCII(c
) && (category(c
) & (Letter_Uppercase 
| Letter_Lowercase 
| Letter_Titlecase 
| Letter_Modifier 
| Letter_Other
))); 
 611 bool Lexer::isIdentPart(int c
) 
 613     return isASCIIAlphanumeric(c
) || c 
== '$' || c 
== '_' || (!isASCII(c
) && (category(c
) & (Letter_Uppercase 
| Letter_Lowercase 
| Letter_Titlecase 
| Letter_Modifier 
| Letter_Other
 
 614                             | Mark_NonSpacing 
| Mark_SpacingCombining 
| Number_DecimalDigit 
| Punctuation_Connector
))); 
 617 static bool isDecimalDigit(int c
) 
 619     return isASCIIDigit(c
); 
 622 bool Lexer::isHexDigit(int c
) 
 624     return isASCIIHexDigit(c
);  
 627 bool Lexer::isOctalDigit(int c
) 
 629     return isASCIIOctalDigit(c
); 
 632 int Lexer::matchPunctuator(int& charPos
, int c1
, int c2
, int c3
, int c4
) 
 634     if (c1 
== '>' && c2 
== '>' && c3 
== '>' && c4 
== '=') { 
 638     if (c1 
== '=' && c2 
== '=' && c3 
== '=') { 
 642     if (c1 
== '!' && c2 
== '=' && c3 
== '=') { 
 646     if (c1 
== '>' && c2 
== '>' && c3 
== '>') { 
 650     if (c1 
== '<' && c2 
== '<' && c3 
== '=') { 
 654     if (c1 
== '>' && c2 
== '>' && c3 
== '=') { 
 658     if (c1 
== '<' && c2 
== '=') { 
 662     if (c1 
== '>' && c2 
== '=') { 
 666     if (c1 
== '!' && c2 
== '=') { 
 670     if (c1 
== '+' && c2 
== '+') { 
 676     if (c1 
== '-' && c2 
== '-') { 
 679             return AUTOMINUSMINUS
; 
 682     if (c1 
== '=' && c2 
== '=') { 
 686     if (c1 
== '+' && c2 
== '=') { 
 690     if (c1 
== '-' && c2 
== '=') { 
 694     if (c1 
== '*' && c2 
== '=') { 
 698     if (c1 
== '/' && c2 
== '=') { 
 702     if (c1 
== '&' && c2 
== '=') { 
 706     if (c1 
== '^' && c2 
== '=') { 
 710     if (c1 
== '%' && c2 
== '=') { 
 714     if (c1 
== '|' && c2 
== '=') { 
 718     if (c1 
== '<' && c2 
== '<') { 
 722     if (c1 
== '>' && c2 
== '>') { 
 726     if (c1 
== '&' && c2 
== '&') { 
 730     if (c1 
== '|' && c2 
== '|') { 
 759             return static_cast<int>(c1
); 
 761             charPos 
= m_currentOffset
; 
 765             charPos 
= m_currentOffset
; 
 773 unsigned short Lexer::singleEscape(unsigned short c
) 
 799 unsigned short Lexer::convertOctal(int c1
, int c2
, int c3
) 
 801     return static_cast<unsigned short>((c1 
- '0') * 64 + (c2 
- '0') * 8 + c3 
- '0'); 
 804 unsigned char Lexer::convertHex(int c
) 
 806     if (c 
>= '0' && c 
<= '9') 
 807         return static_cast<unsigned char>(c 
- '0'); 
 808     if (c 
>= 'a' && c 
<= 'f') 
 809         return static_cast<unsigned char>(c 
- 'a' + 10); 
 810     return static_cast<unsigned char>(c 
- 'A' + 10); 
 813 unsigned char Lexer::convertHex(int c1
, int c2
) 
 815     return ((convertHex(c1
) << 4) + convertHex(c2
)); 
 818 UChar 
Lexer::convertUnicode(int c1
, int c2
, int c3
, int c4
) 
 820     unsigned char highByte 
= (convertHex(c1
) << 4) + convertHex(c2
); 
 821     unsigned char lowByte 
= (convertHex(c3
) << 4) + convertHex(c4
); 
 822     return (highByte 
<< 8 | lowByte
); 
 825 void Lexer::record8(int c
) 
 829     m_buffer8
.append(static_cast<char>(c
)); 
 832 void Lexer::record16(int c
) 
 835     ASSERT(c 
<= USHRT_MAX
); 
 836     record16(UChar(static_cast<unsigned short>(c
))); 
 839 void Lexer::record16(UChar c
) 
 841     m_buffer16
.append(c
); 
 844 bool Lexer::scanRegExp() 
 847     bool lastWasEscape 
= false; 
 848     bool inBrackets 
= false; 
 851         if (isLineTerminator() || m_current 
== -1) 
 853         else if (m_current 
!= '/' || lastWasEscape 
== true || inBrackets 
== true) { 
 854             // keep track of '[' and ']' 
 855             if (!lastWasEscape
) { 
 856                 if ( m_current 
== '[' && !inBrackets 
) 
 858                 if ( m_current 
== ']' && inBrackets 
) 
 863             !lastWasEscape 
&& (m_current 
== '\\'); 
 864         } else { // end of regexp 
 865             m_pattern 
= UString(m_buffer16
); 
 873     while (isIdentPart(m_current
)) { 
 877     m_flags 
= UString(m_buffer16
); 
 884     m_identifiers
.clear(); 
 886     Vector
<char> newBuffer8
; 
 887     newBuffer8
.reserveInitialCapacity(initialReadBufferCapacity
); 
 888     m_buffer8
.swap(newBuffer8
); 
 890     Vector
<UChar
> newBuffer16
; 
 891     newBuffer16
.reserveInitialCapacity(initialReadBufferCapacity
); 
 892     m_buffer16
.swap(newBuffer16
); 
 894     m_isReparsing 
= false;