2 * Copyright (C) 1999-2000 Harri Porten (porten@kde.org)
3 * Copyright (C) 2006, 2007, 2008, 2009 Apple Inc. All Rights Reserved.
4 * Copyright (C) 2007 Cameron Zwarich (cwzwarich@uwaterloo.ca)
6 * This library is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Library General Public
8 * License as published by the Free Software Foundation; either
9 * version 2 of the License, or (at your option) any later version.
11 * This library is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Library General Public License for more details.
16 * You should have received a copy of the GNU Library General Public License
17 * along with this library; see the file COPYING.LIB. If not, write to
18 * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
19 * Boston, MA 02110-1301, USA.
26 #include "JSFunction.h"
27 #include "JSGlobalObjectFunctions.h"
34 #include <wtf/Assertions.h>
37 using namespace Unicode
;
39 // We can't specify the namespace in yacc's C output, so do it here instead.
44 #include "Lexer.lut.h"
// UTF-16 code unit U+FEFF. The lexer compares source characters against this
// value to detect byte order marks and leave them out of the scanned text.
48 static const UChar byteOrderMark
= 0xFEFF;
// Constructs a lexer for the given JSGlobalData. The visible member
// initializers clear m_isReparsing, store the globalData pointer, and
// initialize the keyword table from JSC::mainTable.
// NOTE(review): the constructor body is not visible in this extract.
50 Lexer::Lexer(JSGlobalData
* globalData
)
51 : m_isReparsing(false)
52 , m_globalData(globalData
)
53 , m_keywordTable(JSC::mainTable
)
// Calls deleteTable() on the keyword table that the constructor initialized.
// NOTE(review): the enclosing function header is missing from this extract --
// presumably the destructor; confirm against the full file.
59 m_keywordTable
.deleteTable();
// Returns a const UChar pointer; callers in this file subtract m_codeStart or
// a saved start pointer from it to obtain offsets and lengths.
// NOTE(review): the body is not visible in this extract -- presumably the
// address of the character currently held in m_current; confirm.
62 inline const UChar
* Lexer::currentCharacter() const
// Returns the position of currentCharacter() relative to m_codeStart, counted
// in UChar units (pointer difference).
67 inline int Lexer::currentOffset() const
69 return currentCharacter() - m_codeStart
;
// NOTE(review): only the bounds check is visible in this extract. Presumably
// advances the lookahead window by one character; confirm against the full file.
// The check marks "m_code is still before m_codeEnd" as the likely case.
72 ALWAYS_INLINE
void Lexer::shift1()
77 if (LIKELY(m_code
< m_codeEnd
))
// NOTE(review): only part of the body is visible in this extract. Presumably
// advances the lookahead window by two characters; confirm against the full file.
// The likely branch requires at least two more readable characters
// (m_code + 1 < m_codeEnd). The other visible assignment loads m_next2 from
// m_code[0] only when that is still in range, and otherwise stores -1.
85 ALWAYS_INLINE
void Lexer::shift2()
89 if (LIKELY(m_code
+ 1 < m_codeEnd
)) {
93 m_next2
= m_code
< m_codeEnd
? m_code
[0] : -1;
// NOTE(review): only part of the body is visible in this extract. Presumably
// advances the lookahead window by three characters; confirm against the full file.
// The likely branch requires m_code + 2 < m_codeEnd. The remaining visible
// assignments bounds-check each read individually and store -1 for any
// position at or past m_codeEnd.
100 ALWAYS_INLINE
void Lexer::shift3()
103 if (LIKELY(m_code
+ 2 < m_codeEnd
)) {
108 m_next1
= m_code
< m_codeEnd
? m_code
[0] : -1;
109 m_next2
= m_code
+ 1 < m_codeEnd
? m_code
[1] : -1;
// NOTE(review): only part of the body is visible in this extract. Presumably
// refills the whole lookahead window after consuming four characters; confirm.
// When m_code + 3 < m_codeEnd the characters are read without further checks
// (only the m_current load is visible). Otherwise each of m_current, m_next1
// and m_next2 is loaded behind its own bounds check, with -1 stored for
// positions at or past m_codeEnd.
116 ALWAYS_INLINE
void Lexer::shift4()
118 if (LIKELY(m_code
+ 3 < m_codeEnd
)) {
119 m_current
= m_code
[0];
124 m_current
= m_code
< m_codeEnd
? m_code
[0] : -1;
125 m_next1
= m_code
+ 1 < m_codeEnd
? m_code
[1] : -1;
126 m_next2
= m_code
+ 2 < m_codeEnd
? m_code
[2] : -1;
// Points the lexer at a new piece of source text.
//   source - supplies the first line number, the provider's character data,
//            and the start/end offsets of the range to scan.
//   arena  - its identifierArena() is stored in m_arena and later used by
//            makeIdentifier().
// NOTE(review): several statements of the body are missing from this extract
// (for example the assignment of m_codeStart); only the visible ones are
// described.
133 void Lexer::setCode(const SourceCode
& source
, ParserArena
& arena
)
135 m_arena
= &arena
.identifierArena();
137 m_lineNumber
= source
.firstLine();
141 const UChar
* data
= source
.provider()->data();
// The scan range is [data + startOffset, data + endOffset).
145 m_code
= data
+ source
.startOffset();
146 m_codeEnd
= data
+ source
.endOffset();
148 m_atLineStart
= true;
// Pre-size the scratch buffers; the 16-bit buffer gets half the source length.
150 m_buffer8
.reserveInitialCapacity(initialReadBufferCapacity
);
151 m_buffer16
.reserveInitialCapacity((m_codeEnd
- m_code
) / 2);
153 // ECMA-262 calls for stripping all Cf characters, but we only strip BOM characters.
154 // See <https://bugs.webkit.org/show_bug.cgi?id=4931> for details.
// Only scan for BOMs when the provider reports having any. Note that the scan
// begins at m_codeStart rather than m_code.
155 if (source
.provider()->hasBOMs()) {
156 for (const UChar
* p
= m_codeStart
; p
< m_codeEnd
; ++p
) {
157 if (UNLIKELY(*p
== byteOrderMark
)) {
158 copyCodeWithoutBOMs();
164 // Read the first characters into the 4-character buffer.
// After setup the lexer's offset must equal the source's start offset.
166 ASSERT(currentOffset() == source
.startOffset());
// Copies the characters in [m_code, m_codeEnd) that are not byte order marks
// into m_codeWithoutBOMs, then repoints m_code, m_codeStart and m_codeEnd at
// that copy.
// NOTE(review): the declaration of the local 'c' is missing from this
// extract -- presumably the character read from *p; confirm.
169 void Lexer::copyCodeWithoutBOMs()
171 // Note: In this case, the character offset data for debugging will be incorrect.
172 // If it's important to correctly debug code with extraneous BOMs, then the caller
173 // should strip the BOMs when creating the SourceProvider object and do its own
174 // mapping of offsets within the stripped text to original text offset.
// Reserve room for the whole range up front; the copy can only be shorter.
176 m_codeWithoutBOMs
.reserveCapacity(m_codeEnd
- m_code
);
177 for (const UChar
* p
= m_code
; p
< m_codeEnd
; ++p
) {
179 if (c
!= byteOrderMark
)
180 m_codeWithoutBOMs
.append(c
);
// Keep m_codeStart at the same distance from m_code as before the switch to
// the stripped copy.
182 ptrdiff_t startDelta
= m_codeStart
- m_code
;
183 m_code
= m_codeWithoutBOMs
.data();
184 m_codeStart
= m_code
+ startDelta
;
185 m_codeEnd
= m_codeWithoutBOMs
.data() + m_codeWithoutBOMs
.size();
// Handles a line terminator; the caller must have a line terminator in
// m_current (asserted).
// NOTE(review): the statements following the pair test are missing from this
// extract -- presumably they consume one or two characters and bump the line
// count; confirm.
188 void Lexer::shiftLineTerminator()
190 ASSERT(isLineTerminator(m_current
));
192 // Allow both CRLF and LFCR.
// Comparing the sum of the two characters with '\n' + '\r' tests for either
// ordering of the pair in a single comparison.
193 if (m_current
+ m_next1
== '\n' + '\r')
// Creates an Identifier from 'length' characters starting at 'characters' by
// forwarding to the identifier arena stored in m_arena, and returns its
// address.
201 ALWAYS_INLINE
const Identifier
* Lexer::makeIdentifier(const UChar
* characters
, size_t length
)
203 return &m_arena
->makeIdentifier(m_globalData
, characters
, length
);
// Returns true when the previous token (m_lastToken) was CONTINUE, BREAK,
// RETURN or THROW. In this file it is consulted right after a line terminator
// has been consumed.
206 inline bool Lexer::lastTokenWasRestrKeyword() const
208 return m_lastToken
== CONTINUE
|| m_lastToken
== BREAK
|| m_lastToken
== RETURN
|| m_lastToken
== THROW
;
// Slow path of isIdentStart() for non-ASCII characters: accepts c when its
// Unicode category is one of the letter categories (uppercase, lowercase,
// titlecase, modifier, other). Marked NEVER_INLINE, so it stays out of line.
211 static NEVER_INLINE
bool isNonASCIIIdentStart(int c
)
213 return category(c
) & (Letter_Uppercase
| Letter_Lowercase
| Letter_Titlecase
| Letter_Modifier
| Letter_Other
);
// Returns true if c may begin an identifier. ASCII input is decided inline
// (a letter, '$' or '_'); anything else is deferred to isNonASCIIIdentStart().
216 static inline bool isIdentStart(int c
)
218 return isASCII(c
) ? isASCIIAlpha(c
) || c
== '$' || c
== '_' : isNonASCIIIdentStart(c
);
// Slow path of isIdentPart() for non-ASCII characters: accepts c when its
// Unicode category is a letter category, a non-spacing or spacing-combining
// mark, a decimal digit, or connector punctuation. Marked NEVER_INLINE.
221 static NEVER_INLINE
bool isNonASCIIIdentPart(int c
)
223 return category(c
) & (Letter_Uppercase
| Letter_Lowercase
| Letter_Titlecase
| Letter_Modifier
| Letter_Other
224 | Mark_NonSpacing
| Mark_SpacingCombining
| Number_DecimalDigit
| Punctuation_Connector
);
// Returns true if c may continue an identifier. ASCII input is decided inline
// (a letter, a digit, '$' or '_'); anything else is deferred to
// isNonASCIIIdentPart().
227 static inline bool isIdentPart(int c
)
229 return isASCII(c
) ? isASCIIAlphanumeric(c
) || c
== '$' || c
== '_' : isNonASCIIIdentPart(c
);
// Used by the string-escape handling in lex(), which records the value
// returned for the character that follows a backslash.
// NOTE(review): the body is not visible in this extract -- presumably maps
// escape letters such as 'n' or 't' to the characters they stand for; confirm.
232 static inline int singleEscape(int c
)
// Appends c, narrowed to char, to the 8-bit scratch buffer m_buffer8.
// NOTE(review): some lines of the body are missing from this extract
// (possibly range assertions on c); confirm.
252 inline void Lexer::record8(int c
)
256 m_buffer8
.append(static_cast<char>(c
));
// Appends one UChar to the 16-bit scratch buffer m_buffer16.
259 inline void Lexer::record16(UChar c
)
261 m_buffer16
.append(c
);
// int overload: asserts that c does not exceed USHRT_MAX, narrows it to
// unsigned short, and forwards to the UChar overload.
// NOTE(review): a line of the body is missing from this extract (possibly a
// lower-bound assertion); confirm.
264 inline void Lexer::record16(int c
)
267 ASSERT(c
<= USHRT_MAX
);
268 record16(UChar(static_cast<unsigned short>(c
)));
// Scans the next token from the source.
//   p1 - cast to YYSTYPE*; receives the token's value (intValue, doubleValue
//        or ident, depending on the token).
//   p2 - cast to YYLTYPE*; receives the token's line and column information.
// Both scratch buffers must be empty on entry (asserted).
// NOTE(review): this extract omits a large part of the body -- the switch and
// case labels, most shift calls, several goto labels and the return
// statements. The comments below describe only what the visible lines show.
271 int Lexer::lex(void* p1
, void* p2
)
274 ASSERT(m_buffer8
.isEmpty());
275 ASSERT(m_buffer16
.isEmpty());
277 YYSTYPE
* lvalp
= static_cast<YYSTYPE
*>(p1
);
278 YYLTYPE
* llocp
= static_cast<YYLTYPE
*>(p2
);
280 m_terminator
= false;
// Loops while the current character is white space (loop body not visible).
283 while (isWhiteSpace(m_current
))
// Offset where this token begins; stored in llocp->first_column at the end.
286 int startOffset
= currentOffset();
// m_current == -1 is the value the shift functions store once the source is
// exhausted.
288 if (m_current
== -1) {
289 if (!m_terminator
&& !m_delimited
&& !m_isReparsing
) {
290 // automatic semicolon insertion if program incomplete
// The following tests look ahead through m_next1..m_next3 to recognize
// multi-character punctuators; the case labels selecting on m_current are
// not visible in this extract.
300 if (m_next1
== '>' && m_next2
== '>') {
301 if (m_next3
== '=') {
303 token
= URSHIFTEQUAL
;
310 if (m_next1
== '>') {
311 if (m_next2
== '=') {
320 if (m_next1
== '=') {
329 if (m_next1
== '=') {
330 if (m_next2
== '=') {
343 if (m_next1
== '=') {
344 if (m_next2
== '=') {
357 if (m_next1
== '!' && m_next2
== '-' && m_next3
== '-') {
358 // <!-- marks the beginning of a line comment (for www usage)
360 goto inSingleLineComment
;
362 if (m_next1
== '<') {
363 if (m_next2
== '=') {
372 if (m_next1
== '=') {
381 if (m_next1
== '+') {
384 token
= AUTOPLUSPLUS
;
390 if (m_next1
== '=') {
399 if (m_next1
== '-') {
// A '>' following the '-' pair is routed to the single-line comment handling,
// but only when the lexer is at the start of a line.
400 if (m_atLineStart
&& m_next2
== '>') {
402 goto inSingleLineComment
;
406 token
= AUTOMINUSMINUS
;
412 if (m_next1
== '=') {
421 if (m_next1
== '=') {
430 if (m_next1
== '/') {
432 goto inSingleLineComment
;
435 goto inMultiLineComment
;
436 if (m_next1
== '=') {
445 if (m_next1
== '&') {
450 if (m_next1
== '=') {
459 if (m_next1
== '=') {
468 if (m_next1
== '=') {
477 if (m_next1
== '=') {
482 if (m_next1
== '|') {
// A digit in m_next1 here jumps to the fractional-part scanning of a number.
491 if (isASCIIDigit(m_next1
)) {
494 goto inNumberAfterDecimalPoint
;
// These two tokens carry the current source offset as their value.
516 lvalp
->intValue
= currentOffset();
521 lvalp
->intValue
= currentOffset();
527 goto startIdentifierWithBackslash
;
529 goto startNumberWithZeroDigit
;
544 if (isIdentStart(m_current
))
545 goto startIdentifierOrKeyword
;
546 if (isLineTerminator(m_current
)) {
547 shiftLineTerminator();
548 m_atLineStart
= true;
550 if (lastTokenWasRestrKeyword()) {
559 m_atLineStart
= false;
// String literal: remember the quote character in use; the loops below run
// until that same character is seen again.
563 int stringQuoteCharacter
= m_current
;
// Fast path: while no special character is met, the literal is taken straight
// from the source text (see the makeIdentifier call below) without copying.
566 const UChar
* stringStart
= currentCharacter();
567 while (m_current
!= stringQuoteCharacter
) {
568 // Fast check for characters that require special handling.
569 // Catches -1, \n, \r, \, 0x2028, and 0x2029 as efficiently
570 // as possible, and lets through all common ASCII characters.
571 if (UNLIKELY(m_current
== '\\') || UNLIKELY(((static_cast<unsigned>(m_current
) - 0xE) & 0x2000))) {
// Special character found: copy what was scanned so far into m_buffer16.
572 m_buffer16
.append(stringStart
, currentCharacter() - stringStart
);
577 lvalp
->ident
= makeIdentifier(stringStart
, currentCharacter() - stringStart
);
579 m_atLineStart
= false;
// Slow path: scans up to the closing quote, branching on backslashes, line
// terminators and end of input (-1).
585 while (m_current
!= stringQuoteCharacter
) {
586 if (m_current
== '\\')
587 goto inStringEscapeSequence
;
588 if (UNLIKELY(isLineTerminator(m_current
)))
590 if (UNLIKELY(m_current
== -1))
// Escape handling, in the order tested below: 'x' with two hex digits, 'u'
// with four hex digits, octal digits, a line terminator, and finally
// singleEscape() for everything else.
597 inStringEscapeSequence
:
599 if (m_current
== 'x') {
601 if (isASCIIHexDigit(m_current
) && isASCIIHexDigit(m_next1
)) {
602 record16(convertHex(m_current
, m_next1
));
607 if (m_current
== stringQuoteCharacter
)
611 if (m_current
== 'u') {
613 if (isASCIIHexDigit(m_current
) && isASCIIHexDigit(m_next1
) && isASCIIHexDigit(m_next2
) && isASCIIHexDigit(m_next3
)) {
614 record16(convertUnicode(m_current
, m_next1
, m_next2
, m_next3
));
618 if (m_current
== stringQuoteCharacter
) {
624 if (isASCIIOctalDigit(m_current
)) {
// Three octal digits are combined only when the first is '0'..'3'.
625 if (m_current
>= '0' && m_current
<= '3' && isASCIIOctalDigit(m_next1
) && isASCIIOctalDigit(m_next2
)) {
626 record16((m_current
- '0') * 64 + (m_next1
- '0') * 8 + m_next2
- '0');
630 if (isASCIIOctalDigit(m_next1
)) {
631 record16((m_current
- '0') * 8 + m_next1
- '0');
635 record16(m_current
- '0');
639 if (isLineTerminator(m_current
)) {
640 shiftLineTerminator();
645 record16(singleEscape(m_current
));
// Identifier beginning with a backslash: requires 'u' plus four hex digits,
// and the decoded character must satisfy isIdentStart().
650 startIdentifierWithBackslash
:
652 if (UNLIKELY(m_current
!= 'u'))
655 if (UNLIKELY(!isASCIIHexDigit(m_current
) || !isASCIIHexDigit(m_next1
) || !isASCIIHexDigit(m_next2
) || !isASCIIHexDigit(m_next3
)))
657 token
= convertUnicode(m_current
, m_next1
, m_next2
, m_next3
);
658 if (UNLIKELY(!isIdentStart(token
)))
660 goto inIdentifierAfterCharacterCheck
;
// Identifier or keyword. When scanning does not stop at a backslash, the
// identifier is created directly from the source text.
662 startIdentifierOrKeyword
: {
663 const UChar
* identifierStart
= currentCharacter();
665 while (isIdentPart(m_current
))
667 if (LIKELY(m_current
!= '\\')) {
668 lvalp
->ident
= makeIdentifier(identifierStart
, currentCharacter() - identifierStart
);
669 goto doneIdentifierOrKeyword
;
// A backslash was met: continue in m_buffer16, starting with the characters
// scanned so far.
671 m_buffer16
.append(identifierStart
, currentCharacter() - identifierStart
);
676 if (UNLIKELY(m_current
!= 'u'))
679 if (UNLIKELY(!isASCIIHexDigit(m_current
) || !isASCIIHexDigit(m_next1
) || !isASCIIHexDigit(m_next2
) || !isASCIIHexDigit(m_next3
)))
681 token
= convertUnicode(m_current
, m_next1
, m_next2
, m_next3
);
682 if (UNLIKELY(!isIdentPart(token
)))
684 inIdentifierAfterCharacterCheck
:
688 while (isIdentPart(m_current
)) {
692 } while (UNLIKELY(m_current
== '\\'));
// NOTE(review): the label for this section is missing from the extract --
// presumably inSingleLineComment. It scans up to a line terminator or end of
// input.
696 while (!isLineTerminator(m_current
)) {
697 if (UNLIKELY(m_current
== -1))
701 shiftLineTerminator();
702 m_atLineStart
= true;
704 if (lastTokenWasRestrKeyword())
// NOTE(review): label missing from the extract -- presumably
// inMultiLineComment. It scans until the "*/" pair, passing line terminators
// to shiftLineTerminator().
710 while (m_current
!= '*' || m_next1
!= '/') {
711 if (isLineTerminator(m_current
))
712 shiftLineTerminator();
715 if (UNLIKELY(m_current
== -1))
720 m_atLineStart
= false;
// Number that started with the digit '0'. OR-ing with 0x20 sets the ASCII
// lower-case bit, so one comparison matches both 'x' and 'X' (likewise for
// 'e' and 'E' below).
723 startNumberWithZeroDigit
:
725 if ((m_current
| 0x20) == 'x' && isASCIIHexDigit(m_next1
)) {
729 if (m_current
== '.') {
733 goto inNumberAfterDecimalPoint
;
735 if ((m_current
| 0x20) == 'e') {
739 goto inExponentIndicator
;
741 if (isASCIIOctalDigit(m_current
))
743 if (isASCIIDigit(m_current
))
745 lvalp
->doubleValue
= 0;
748 inNumberAfterDecimalPoint
:
749 while (isASCIIDigit(m_current
)) {
753 if ((m_current
| 0x20) == 'e') {
756 goto inExponentIndicator
;
// Exponent part: a '+' or '-' sign is tested for, followed by a digit test.
761 if (m_current
== '+' || m_current
== '-') {
765 if (!isASCIIDigit(m_current
))
770 } while (isASCIIDigit(m_current
));
// Octal literal: the digits are walked from m_buffer8 into dval. Once dval
// reaches mantissaOverflowLowerBound it is recomputed by parseIntOverflow()
// in radix 8.
777 } while (isASCIIOctalDigit(m_current
));
778 if (isASCIIDigit(m_current
))
783 const char* end
= m_buffer8
.end();
784 for (const char* p
= m_buffer8
.data(); p
< end
; ++p
) {
788 if (dval
>= mantissaOverflowLowerBound
)
789 dval
= parseIntOverflow(m_buffer8
.data(), end
- m_buffer8
.data(), 8);
793 lvalp
->doubleValue
= dval
;
// Hex literal: same scheme as the octal case, in radix 16.
801 } while (isASCIIHexDigit(m_current
));
805 const char* end
= m_buffer8
.end();
806 for (const char* p
= m_buffer8
.data(); p
< end
; ++p
) {
808 dval
+= toASCIIHexValue(*p
);
810 if (dval
>= mantissaOverflowLowerBound
)
811 dval
= parseIntOverflow(m_buffer8
.data(), end
- m_buffer8
.data(), 16);
815 lvalp
->doubleValue
= dval
;
// Decimal literal: digits, then an optional '.' or exponent continuation.
822 while (isASCIIDigit(m_current
)) {
826 if (m_current
== '.') {
829 goto inNumberAfterDecimalPoint
;
831 if ((m_current
| 0x20) == 'e') {
834 goto inExponentIndicator
;
837 // Fall through into doneNumber.
840 // Null-terminate string for strtod.
841 m_buffer8
.append('\0');
842 lvalp
->doubleValue
= WTF::strtod(m_buffer8
.data(), 0);
845 // Fall through into doneNumeric.
848 // No identifiers allowed directly after numeric literal, e.g. "3in" is bad.
849 if (UNLIKELY(isIdentStart(m_current
)))
852 m_atLineStart
= false;
863 m_atLineStart
= false;
// The token text was accumulated in m_buffer16: build the identifier from it
// and empty the buffer for the next token.
865 lvalp
->ident
= makeIdentifier(m_buffer16
.data(), m_buffer16
.size());
866 m_buffer16
.resize(0);
// Looks the identifier up in the keyword table; a hit supplies the keyword's
// token value, a miss yields IDENT.
870 doneIdentifierOrKeyword
: {
871 m_atLineStart
= false;
873 m_buffer16
.resize(0);
874 const HashEntry
* entry
= m_keywordTable
.entry(m_globalData
, *lvalp
->ident
);
875 token
= entry
? entry
->lexerValue() : IDENT
;
880 // Atomize constant strings in case they're later used in property lookup.
882 m_atLineStart
= false;
884 lvalp
->ident
= makeIdentifier(m_buffer16
.data(), m_buffer16
.size());
885 m_buffer16
.resize(0);
888 // Fall through into returnToken.
// Location info: both line fields get the current line number; the column
// fields carry the token's start and end character offsets.
891 int lineNumber
= m_lineNumber
;
892 llocp
->first_line
= lineNumber
;
893 llocp
->last_line
= lineNumber
;
894 llocp
->first_column
= startOffset
;
895 llocp
->last_column
= currentOffset();
// Scans a regular expression literal.
//   pattern       - out: identifier built from the pattern text.
//   flags         - out: identifier built from the flags text.
//   patternPrefix - character recorded before the scanned pattern text; it
//                   must not be a line terminator, '/' or '[' (asserted).
// m_buffer16 must be empty on entry (asserted).
// NOTE(review): the loop structure, bracket handling and return statements
// are missing from this extract. Presumably returns false when the pattern is
// cut short by a line terminator or end of input; confirm.
906 bool Lexer::scanRegExp(const Identifier
*& pattern
, const Identifier
*& flags
, UChar patternPrefix
)
908 ASSERT(m_buffer16
.isEmpty());
910 bool lastWasEscape
= false;
911 bool inBrackets
= false;
914 ASSERT(!isLineTerminator(patternPrefix
));
915 ASSERT(patternPrefix
!= '/');
916 ASSERT(patternPrefix
!= '[');
917 record16(patternPrefix
);
921 int current
= m_current
;
// A line terminator or end of input (-1) inside the pattern discards what was
// collected so far.
923 if (isLineTerminator(current
) || current
== -1) {
924 m_buffer16
.resize(0);
// A '/' is tested specially only when it is neither escaped nor inside
// brackets.
930 if (current
== '/' && !lastWasEscape
&& !inBrackets
)
936 lastWasEscape
= false;
948 lastWasEscape
= true;
953 pattern
= makeIdentifier(m_buffer16
.data(), m_buffer16
.size());
954 m_buffer16
.resize(0);
// After the pattern, characters that satisfy isIdentPart() are scanned as the
// flags.
956 while (isIdentPart(m_current
)) {
961 flags
= makeIdentifier(m_buffer16
.data(), m_buffer16
.size());
962 m_buffer16
.resize(0);
// Scans past a regular expression literal with the same visible tests as
// scanRegExp(), but no visible line records characters or builds identifiers.
// NOTE(review): the loop structure, bracket handling and return statements
// are missing from this extract. Presumably returns false when the pattern is
// cut short by a line terminator or end of input; confirm.
967 bool Lexer::skipRegExp()
969 bool lastWasEscape
= false;
970 bool inBrackets
= false;
973 int current
= m_current
;
975 if (isLineTerminator(current
) || current
== -1)
// A '/' is tested specially only when it is neither escaped nor inside
// brackets.
980 if (current
== '/' && !lastWasEscape
&& !inBrackets
)
984 lastWasEscape
= false;
996 lastWasEscape
= true;
// Trailing characters that satisfy isIdentPart() (the flags) are scanned too.
1001 while (isIdentPart(m_current
))
// Resets the lexer's buffers: clears the BOM-stripped source copy and swaps
// each scratch buffer with a freshly constructed empty Vector, so the old
// contents end up in the locals. Also clears the m_isReparsing flag.
// NOTE(review): the enclosing function header is missing from this extract --
// presumably Lexer::clear(); confirm against the full file.
1010 m_codeWithoutBOMs
.clear();
1012 Vector
<char> newBuffer8
;
1013 m_buffer8
.swap(newBuffer8
);
1015 Vector
<UChar
> newBuffer16
;
1016 m_buffer16
.swap(newBuffer16
);
1018 m_isReparsing
= false;
// Builds a SourceCode for the range from openBrace through closeBrace
// inclusive (closeBrace + 1 is passed as the end), starting at line firstLine.
// When no BOM-stripped copy is in use, the offsets are passed through
// unchanged. Otherwise the provider's original data is walked and byte order
// marks are looked for, first before openBrace and then up to closeBrace.
// NOTE(review): the statements executed when a BOM is found, and the
// declaration of 'i', are missing from this extract. Presumably they shift
// openBrace/closeBrace so that offsets into the stripped text map back onto
// the original text; confirm.
1021 SourceCode
Lexer::sourceCode(int openBrace
, int closeBrace
, int firstLine
)
1023 if (m_codeWithoutBOMs
.isEmpty())
1024 return SourceCode(m_source
->provider(), openBrace
, closeBrace
+ 1, firstLine
);
1026 const UChar
* data
= m_source
->provider()->data();
1028 ASSERT(openBrace
< closeBrace
);
1030 for (i
= m_source
->startOffset(); i
< openBrace
; ++i
) {
1031 if (data
[i
] == byteOrderMark
) {
// Continues from where the first loop stopped, up to closeBrace.
1036 for (; i
< closeBrace
; ++i
) {
1037 if (data
[i
] == byteOrderMark
)
1041 ASSERT(openBrace
< closeBrace
);
1043 return SourceCode(m_source
->provider(), openBrace
, closeBrace
+ 1, firstLine
);