2 * Copyright (C) 1999-2000 Harri Porten (porten@kde.org)
3 * Copyright (C) 2006, 2007, 2008, 2009 Apple Inc. All Rights Reserved.
4 * Copyright (C) 2007 Cameron Zwarich (cwzwarich@uwaterloo.ca)
6 * This library is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Library General Public
8 * License as published by the Free Software Foundation; either
9 * version 2 of the License, or (at your option) any later version.
11 * This library is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Library General Public License for more details.
16 * You should have received a copy of the GNU Library General Public License
17 * along with this library; see the file COPYING.LIB. If not, write to
18 * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
19 * Boston, MA 02110-1301, USA.
26 #include "JSFunction.h"
27 #include "JSGlobalObjectFunctions.h"
34 #include <wtf/Assertions.h>
37 using namespace Unicode
;
39 // We can't specify the namespace in yacc's C output, so do it here instead.
44 #include "Lexer.lut.h"
// Code unit of the Unicode byte order mark (U+FEFF). setCode() scans the source for
// it, copyCodeWithoutBOMs() drops every occurrence, and sourceCode() counts them.
48 static const UChar byteOrderMark
= 0xFEFF;
// Constructs a lexer for the given JSGlobalData: m_isReparsing starts out false, the
// keyword table is initialized from JSC::mainTable, and both scratch buffers reserve
// initialReadBufferCapacity elements up front.
// NOTE(review): the braces (and any other body lines) are absent from this listing.
50 Lexer::Lexer(JSGlobalData
* globalData
)
51 : m_isReparsing(false)
52 , m_globalData(globalData
)
53 , m_keywordTable(JSC::mainTable
)
55 m_buffer8
.reserveInitialCapacity(initialReadBufferCapacity
);
56 m_buffer16
.reserveInitialCapacity(initialReadBufferCapacity
);
// NOTE(review): the header of the enclosing function is missing from this listing;
// presumably this is the body of Lexer::~Lexer() -- confirm against the full file.
// Calls deleteTable() on the keyword table member.
61 m_keywordTable
.deleteTable();
// Const accessor returning a const UChar*. Its body is not included in this listing.
// Callers in this file use the result as a position in the source text: currentOffset()
// subtracts m_codeStart from it, and lex() uses it to delimit strings and identifiers.
// NOTE(review): confirm how the pointer is derived from m_code.
64 inline const UChar
* Lexer::currentCharacter() const
// Returns the distance, in UChar elements, from m_codeStart to currentCharacter(),
// converted to int.
69 inline int Lexer::currentOffset() const
71 return currentCharacter() - m_codeStart
;
// Presumably advances the lexer's character window by one position (inferred from the
// name; the assignments themselves are not in this listing). The only visible line is
// the bounds test: the branch hinted as LIKELY is taken while m_code < m_codeEnd.
74 ALWAYS_INLINE
void Lexer::shift1()
79 if (LIKELY(m_code
< m_codeEnd
))
// Presumably advances the character window by two positions (inferred from the name).
// The LIKELY branch runs when m_code[0] and m_code[1] are both in range
// (m_code + 1 < m_codeEnd); its assignments are not in this listing. The remaining
// visible line appears to be the slow path: m_next2 is loaded from m_code[0] if that
// is still in range, and otherwise receives -1 to mark the end of the input.
87 ALWAYS_INLINE
void Lexer::shift2()
91 if (LIKELY(m_code
+ 1 < m_codeEnd
)) {
95 m_next2
= m_code
< m_codeEnd
? m_code
[0] : -1;
// Presumably advances the character window by three positions (inferred from the name).
// The LIKELY branch runs when m_code[0..2] are all in range (m_code + 2 < m_codeEnd);
// its assignments are not in this listing. The visible remaining lines bounds-check
// each read individually: m_next1 and m_next2 are loaded from m_code[0] and m_code[1]
// when those are in range, and receive -1 once the end of the input has been reached.
102 ALWAYS_INLINE
void Lexer::shift3()
105 if (LIKELY(m_code
+ 2 < m_codeEnd
)) {
110 m_next1
= m_code
< m_codeEnd
? m_code
[0] : -1;
111 m_next2
= m_code
+ 1 < m_codeEnd
? m_code
[1] : -1;
// Presumably advances the character window by four positions (inferred from the name),
// so every visible assignment reloads a window slot straight from m_code.
// When m_code[0..3] are all in range (m_code + 3 < m_codeEnd) the LIKELY branch reads
// without further checks (only the m_current assignment is present in this listing).
// The remaining lines bounds-check each read: m_current, m_next1 and m_next2 come from
// m_code[0], m_code[1] and m_code[2] when in range and are set to -1 otherwise.
118 ALWAYS_INLINE
void Lexer::shift4()
120 if (LIKELY(m_code
+ 3 < m_codeEnd
)) {
121 m_current
= m_code
[0];
126 m_current
= m_code
< m_codeEnd
? m_code
[0] : -1;
127 m_next1
= m_code
+ 1 < m_codeEnd
? m_code
[1] : -1;
128 m_next2
= m_code
+ 2 < m_codeEnd
? m_code
[2] : -1;
// Points the lexer at a new piece of source text.
//   source - supplies the character data (through its provider), the start and end
//            offsets of the range to lex, and the first line number.
//   arena  - its identifierArena() is stored in m_arena.
// NOTE(review): several lines are absent from this listing, including the assignment
// of m_codeStart and the statements that prime the character window.
135 void Lexer::setCode(const SourceCode
& source
, ParserArena
& arena
)
137 m_arena
= &arena
.identifierArena();
139 m_lineNumber
= source
.firstLine();
143 const UChar
* data
= source
.provider()->data();
// m_code and m_codeEnd bracket the [startOffset, endOffset) range of the provider's data.
147 m_code
= data
+ source
.startOffset();
148 m_codeEnd
= data
+ source
.endOffset();
150 m_atLineStart
= true;
152 // ECMA-262 calls for stripping all Cf characters, but we only strip BOM characters.
153 // See <https://bugs.webkit.org/show_bug.cgi?id=4931> for details.
// The scan only runs when the provider reports that it has BOMs. It walks from
// m_codeStart to m_codeEnd and calls copyCodeWithoutBOMs() when it finds one.
154 if (source
.provider()->hasBOMs()) {
155 for (const UChar
* p
= m_codeStart
; p
< m_codeEnd
; ++p
) {
156 if (UNLIKELY(*p
== byteOrderMark
)) {
157 copyCodeWithoutBOMs();
163 // Read the first characters into the 4-character buffer.
// After priming, the reported offset must equal the start offset of the source range.
165 ASSERT(currentOffset() == source
.startOffset());
// Copies the characters in [m_code, m_codeEnd) into m_codeWithoutBOMs, leaving out
// every byteOrderMark, and then repoints m_code, m_codeStart and m_codeEnd at the copy.
// NOTE(review): the line that defines the loop variable `c` is absent from this listing;
// presumably it is the character at *p.
168 void Lexer::copyCodeWithoutBOMs()
170 // Note: In this case, the character offset data for debugging will be incorrect.
171 // If it's important to correctly debug code with extraneous BOMs, then the caller
172 // should strip the BOMs when creating the SourceProvider object and do its own
173 // mapping of offsets within the stripped text to original text offset.
// Reserve room for the whole range up front; the filtered copy can only be shorter.
175 m_codeWithoutBOMs
.reserveCapacity(m_codeEnd
- m_code
);
176 for (const UChar
* p
= m_code
; p
< m_codeEnd
; ++p
) {
178 if (c
!= byteOrderMark
)
179 m_codeWithoutBOMs
.append(c
);
// Keep m_codeStart at the same distance from m_code in the new buffer as it was in the
// original one.
181 ptrdiff_t startDelta
= m_codeStart
- m_code
;
182 m_code
= m_codeWithoutBOMs
.data();
183 m_codeStart
= m_code
+ startDelta
;
184 m_codeEnd
= m_codeWithoutBOMs
.data() + m_codeWithoutBOMs
.size();
// Consumes a line terminator; the caller must ensure m_current is one (asserted).
// The sum test below is true both for '\r' followed by '\n' and for '\n' followed by
// '\r', so either two-character sequence is recognized with a single comparison.
// NOTE(review): the statements that perform the shifts are absent from this listing.
187 void Lexer::shiftLineTerminator()
189 ASSERT(isLineTerminator(m_current
));
191 // Allow both CRLF and LFCR.
192 if (m_current
+ m_next1
== '\n' + '\r')
// Builds an Identifier from `length` UChars starting at `characters` by delegating to
// m_arena->makeIdentifier() with m_globalData, and returns the address of the result.
200 ALWAYS_INLINE
const Identifier
* Lexer::makeIdentifier(const UChar
* characters
, size_t length
)
202 return &m_arena
->makeIdentifier(m_globalData
, characters
, length
);
// Returns true when m_lastToken is one of CONTINUE, BREAK, RETURN or THROW.
// lex() consults this right after it has consumed a line terminator.
205 inline bool Lexer::lastTokenWasRestrKeyword() const
207 return m_lastToken
== CONTINUE
|| m_lastToken
== BREAK
|| m_lastToken
== RETURN
|| m_lastToken
== THROW
;
// Non-ASCII half of isIdentStart(): true when category(c) has any of the letter
// category bits set (uppercase, lowercase, titlecase, modifier, other).
// Marked NEVER_INLINE; isIdentStart() only calls it for non-ASCII input.
210 static NEVER_INLINE
bool isNonASCIIIdentStart(int c
)
212 return category(c
) & (Letter_Uppercase
| Letter_Lowercase
| Letter_Titlecase
| Letter_Modifier
| Letter_Other
);
// Returns whether c may begin an identifier. ASCII input is accepted when it is a
// letter, '$' or '_'; anything else is decided by isNonASCIIIdentStart().
215 static inline bool isIdentStart(int c
)
217 return isASCII(c
) ? isASCIIAlpha(c
) || c
== '$' || c
== '_' : isNonASCIIIdentStart(c
);
// Non-ASCII half of isIdentPart(): accepts the same letter categories as
// isNonASCIIIdentStart() plus non-spacing marks, spacing combining marks, decimal
// digits and connector punctuation.
220 static NEVER_INLINE
bool isNonASCIIIdentPart(int c
)
222 return category(c
) & (Letter_Uppercase
| Letter_Lowercase
| Letter_Titlecase
| Letter_Modifier
| Letter_Other
223 | Mark_NonSpacing
| Mark_SpacingCombining
| Number_DecimalDigit
| Punctuation_Connector
);
// Returns whether c may continue an identifier. ASCII input is accepted when it is
// alphanumeric, '$' or '_'; anything else is decided by isNonASCIIIdentPart().
226 static inline bool isIdentPart(int c
)
228 return isASCII(c
) ? isASCIIAlphanumeric(c
) || c
== '$' || c
== '_' : isNonASCIIIdentPart(c
);
// Maps the character following a backslash to an int result. The body is not
// included in this listing; lex() passes the result straight to record16() in its
// string escape handling.
// NOTE(review): confirm the exact mapping against the full file.
231 static inline int singleEscape(int c
)
// Appends c, narrowed to char, to the 8-bit scratch buffer m_buffer8.
// NOTE(review): the original lines between the header and the append are absent from
// this listing; any range checks they contain are not visible here.
251 inline void Lexer::record8(int c
)
255 m_buffer8
.append(static_cast<char>(c
));
// Appends one UChar to the 16-bit scratch buffer m_buffer16.
258 inline void Lexer::record16(UChar c
)
260 m_buffer16
.append(c
);
// int overload of record16(): asserts that c does not exceed USHRT_MAX, then forwards
// it to the UChar overload through an unsigned short cast.
263 inline void Lexer::record16(int c
)
266 ASSERT(c
<= USHRT_MAX
);
267 record16(UChar(static_cast<unsigned short>(c
)));
// Scans the next token from the source and returns an int.
//   p1 - untyped pointer, cast to YYSTYPE*; token values are written through it (lvalp).
//   p2 - untyped pointer, cast to YYLTYPE*; line and offset data are written through it (llocp).
// Both scratch buffers must be empty on entry (asserted).
// NOTE(review): this listing omits many lines of the body (braces, case labels, shift
// calls, returns, some goto labels), so the comments below describe only what is visible.
270 int Lexer::lex(void* p1
, void* p2
)
273 ASSERT(m_buffer8
.isEmpty());
274 ASSERT(m_buffer16
.isEmpty());
276 YYSTYPE
* lvalp
= static_cast<YYSTYPE
*>(p1
);
277 YYLTYPE
* llocp
= static_cast<YYLTYPE
*>(p2
);
279 m_terminator
= false;
282 while (isWhiteSpace(m_current
))
// Offset at which the token begins; stored in llocp->first_column before returning.
285 int startOffset
= currentOffset();
// -1 is the value the shift functions store once the input is exhausted.
287 if (m_current
== -1) {
288 if (!m_terminator
&& !m_delimited
&& !m_isReparsing
) {
289 // automatic semicolon insertion if program incomplete
// Punctuator handling: each group below inspects the lookahead characters (m_next1,
// m_next2, m_next3) to pick the longest matching operator.
299 if (m_next1
== '>' && m_next2
== '>') {
300 if (m_next3
== '=') {
302 token
= URSHIFTEQUAL
;
309 if (m_next1
== '>') {
310 if (m_next2
== '=') {
319 if (m_next1
== '=') {
328 if (m_next1
== '=') {
329 if (m_next2
== '=') {
342 if (m_next1
== '=') {
343 if (m_next2
== '=') {
356 if (m_next1
== '!' && m_next2
== '-' && m_next3
== '-') {
357 // <!-- marks the beginning of a line comment (for www usage)
359 goto inSingleLineComment
;
361 if (m_next1
== '<') {
362 if (m_next2
== '=') {
371 if (m_next1
== '=') {
380 if (m_next1
== '+') {
383 token
= AUTOPLUSPLUS
;
389 if (m_next1
== '=') {
// With m_next1 == '-' and m_next2 == '>', control jumps to inSingleLineComment, but
// only when m_atLineStart is set.
398 if (m_next1
== '-') {
399 if (m_atLineStart
&& m_next2
== '>') {
401 goto inSingleLineComment
;
405 token
= AUTOMINUSMINUS
;
411 if (m_next1
== '=') {
420 if (m_next1
== '=') {
429 if (m_next1
== '/') {
431 goto inSingleLineComment
;
434 goto inMultiLineComment
;
435 if (m_next1
== '=') {
444 if (m_next1
== '&') {
449 if (m_next1
== '=') {
458 if (m_next1
== '=') {
467 if (m_next1
== '=') {
476 if (m_next1
== '=') {
481 if (m_next1
== '|') {
// A digit in m_next1 sends control to the fractional-digits scanner.
490 if (isASCIIDigit(m_next1
)) {
493 goto inNumberAfterDecimalPoint
;
515 lvalp
->intValue
= currentOffset();
520 lvalp
->intValue
= currentOffset();
526 goto startIdentifierWithBackslash
;
528 goto startNumberWithZeroDigit
;
543 if (isIdentStart(m_current
))
544 goto startIdentifierOrKeyword
;
// A line terminator is consumed and m_atLineStart is set; lastTokenWasRestrKeyword() is
// then consulted (the statement it guards is not in this listing).
545 if (isLineTerminator(m_current
)) {
546 shiftLineTerminator();
547 m_atLineStart
= true;
549 if (lastTokenWasRestrKeyword()) {
558 m_atLineStart
= false;
// String literal. Fast path: scan towards the closing quote without copying; if no
// character needing special handling is met, the identifier is made directly from the
// source range [stringStart, currentCharacter()).
562 int stringQuoteCharacter
= m_current
;
565 const UChar
* stringStart
= currentCharacter();
566 while (m_current
!= stringQuoteCharacter
) {
567 // Fast check for characters that require special handling.
568 // Catches -1, \n, \r, \, 0x2028, and 0x2029 as efficiently
569 // as possible, and lets through all common ASCII characters.
570 if (UNLIKELY(m_current
== '\\') || UNLIKELY(((static_cast<unsigned>(m_current
) - 0xE) & 0x2000))) {
// Special character found: copy what was scanned so far into m_buffer16.
571 m_buffer16
.append(stringStart
, currentCharacter() - stringStart
);
576 lvalp
->ident
= makeIdentifier(stringStart
, currentCharacter() - stringStart
);
578 m_atLineStart
= false;
// Slow path for strings: checks each character for a backslash, a line terminator or
// the -1 end marker until the closing quote.
584 while (m_current
!= stringQuoteCharacter
) {
585 if (m_current
== '\\')
586 goto inStringEscapeSequence
;
587 if (UNLIKELY(isLineTerminator(m_current
)))
589 if (UNLIKELY(m_current
== -1))
596 inStringEscapeSequence
:
// 'x' followed by two hex digits records the value convertHex() returns for them.
598 if (m_current
== 'x') {
600 if (isASCIIHexDigit(m_current
) && isASCIIHexDigit(m_next1
)) {
601 record16(convertHex(m_current
, m_next1
));
606 if (m_current
== stringQuoteCharacter
)
// 'u' followed by four hex digits records the value convertUnicode() returns for them.
610 if (m_current
== 'u') {
612 if (isASCIIHexDigit(m_current
) && isASCIIHexDigit(m_next1
) && isASCIIHexDigit(m_next2
) && isASCIIHexDigit(m_next3
)) {
613 record16(convertUnicode(m_current
, m_next1
, m_next2
, m_next3
));
617 if (m_current
== stringQuoteCharacter
) {
// Octal escapes: the three-digit form is only taken when the first digit is 0-3, which
// keeps the value at or below 0377; otherwise two digits or a single digit are used.
623 if (isASCIIOctalDigit(m_current
)) {
624 if (m_current
>= '0' && m_current
<= '3' && isASCIIOctalDigit(m_next1
) && isASCIIOctalDigit(m_next2
)) {
625 record16((m_current
- '0') * 64 + (m_next1
- '0') * 8 + m_next2
- '0');
629 if (isASCIIOctalDigit(m_next1
)) {
630 record16((m_current
- '0') * 8 + m_next1
- '0');
634 record16(m_current
- '0');
638 if (isLineTerminator(m_current
)) {
639 shiftLineTerminator();
644 record16(singleEscape(m_current
));
// Identifier that begins with a backslash: 'u' and four hex digits are required, and
// the decoded character must satisfy isIdentStart().
649 startIdentifierWithBackslash
:
651 if (UNLIKELY(m_current
!= 'u'))
654 if (UNLIKELY(!isASCIIHexDigit(m_current
) || !isASCIIHexDigit(m_next1
) || !isASCIIHexDigit(m_next2
) || !isASCIIHexDigit(m_next3
)))
656 token
= convertUnicode(m_current
, m_next1
, m_next2
, m_next3
);
657 if (UNLIKELY(!isIdentStart(token
)))
659 goto inIdentifierAfterCharacterCheck
;
// Identifier fast path: when the run of identifier characters is not followed by a
// backslash, the identifier is made directly from the source range. Otherwise the
// characters scanned so far are copied into m_buffer16 and the escape is decoded.
661 startIdentifierOrKeyword
: {
662 const UChar
* identifierStart
= currentCharacter();
664 while (isIdentPart(m_current
))
666 if (LIKELY(m_current
!= '\\')) {
667 lvalp
->ident
= makeIdentifier(identifierStart
, currentCharacter() - identifierStart
);
668 goto doneIdentifierOrKeyword
;
670 m_buffer16
.append(identifierStart
, currentCharacter() - identifierStart
);
675 if (UNLIKELY(m_current
!= 'u'))
678 if (UNLIKELY(!isASCIIHexDigit(m_current
) || !isASCIIHexDigit(m_next1
) || !isASCIIHexDigit(m_next2
) || !isASCIIHexDigit(m_next3
)))
680 token
= convertUnicode(m_current
, m_next1
, m_next2
, m_next3
);
681 if (UNLIKELY(!isIdentPart(token
)))
683 inIdentifierAfterCharacterCheck
:
687 while (isIdentPart(m_current
)) {
691 } while (UNLIKELY(m_current
== '\\'));
// Single-line comment: loop until a line terminator, checking for the -1 end marker on
// the way; the terminator is then consumed and m_atLineStart is set.
695 while (!isLineTerminator(m_current
)) {
696 if (UNLIKELY(m_current
== -1))
700 shiftLineTerminator();
701 m_atLineStart
= true;
703 if (lastTokenWasRestrKeyword())
// Multi-line comment: loop until "*/"; line terminators inside the comment go through
// shiftLineTerminator(), and the -1 end marker is checked.
709 while (m_current
!= '*' || m_next1
!= '/') {
710 if (isLineTerminator(m_current
))
711 shiftLineTerminator();
714 if (UNLIKELY(m_current
== -1))
719 m_atLineStart
= false;
722 startNumberWithZeroDigit
:
// OR-ing with 0x20 folds an ASCII upper-case letter to lower case, so these tests
// accept 'x'/'X' and 'e'/'E'.
724 if ((m_current
| 0x20) == 'x' && isASCIIHexDigit(m_next1
)) {
728 if (m_current
== '.') {
732 goto inNumberAfterDecimalPoint
;
734 if ((m_current
| 0x20) == 'e') {
738 goto inExponentIndicator
;
740 if (isASCIIOctalDigit(m_current
))
742 if (isASCIIDigit(m_current
))
744 lvalp
->doubleValue
= 0;
747 inNumberAfterDecimalPoint
:
748 while (isASCIIDigit(m_current
)) {
752 if ((m_current
| 0x20) == 'e') {
755 goto inExponentIndicator
;
// Exponent: an optional sign, then a digit check followed by a loop over the digits.
760 if (m_current
== '+' || m_current
== '-') {
764 if (!isASCIIDigit(m_current
))
769 } while (isASCIIDigit(m_current
));
// Octal literal: after the digit loop, dval is computed from the characters held in
// m_buffer8. Once it reaches mantissaOverflowLowerBound it is recomputed with
// parseIntOverflow() in radix 8.
776 } while (isASCIIOctalDigit(m_current
));
777 if (isASCIIDigit(m_current
))
782 const char* end
= m_buffer8
.end();
783 for (const char* p
= m_buffer8
.data(); p
< end
; ++p
) {
787 if (dval
>= mantissaOverflowLowerBound
)
788 dval
= parseIntOverflow(m_buffer8
.data(), end
- m_buffer8
.data(), 8);
792 lvalp
->doubleValue
= dval
;
// Hex literal: same scheme as the octal case, adding toASCIIHexValue() of each buffered
// character and falling back to parseIntOverflow() in radix 16.
800 } while (isASCIIHexDigit(m_current
));
804 const char* end
= m_buffer8
.end();
805 for (const char* p
= m_buffer8
.data(); p
< end
; ++p
) {
807 dval
+= toASCIIHexValue(*p
);
809 if (dval
>= mantissaOverflowLowerBound
)
810 dval
= parseIntOverflow(m_buffer8
.data(), end
- m_buffer8
.data(), 16);
814 lvalp
->doubleValue
= dval
;
// Decimal literal: a digit loop, then an optional '.' or exponent indicator.
821 while (isASCIIDigit(m_current
)) {
825 if (m_current
== '.') {
828 goto inNumberAfterDecimalPoint
;
830 if ((m_current
| 0x20) == 'e') {
833 goto inExponentIndicator
;
836 // Fall through into doneNumber.
839 // Null-terminate string for strtod.
840 m_buffer8
.append('\0');
841 lvalp
->doubleValue
= WTF::strtod(m_buffer8
.data(), 0);
844 // Fall through into doneNumeric.
847 // No identifiers allowed directly after numeric literal, e.g. "3in" is bad.
848 if (UNLIKELY(isIdentStart(m_current
)))
851 m_atLineStart
= false;
862 m_atLineStart
= false;
// The identifier is made from the characters accumulated in m_buffer16, which is then
// emptied.
864 lvalp
->ident
= makeIdentifier(m_buffer16
.data(), m_buffer16
.size());
865 m_buffer16
.resize(0);
// Keyword lookup: a hit in m_keywordTable supplies the token value, a miss yields IDENT.
869 doneIdentifierOrKeyword
: {
870 m_atLineStart
= false;
872 m_buffer16
.resize(0);
873 const HashEntry
* entry
= m_keywordTable
.entry(m_globalData
, *lvalp
->ident
);
874 token
= entry
? entry
->lexerValue() : IDENT
;
879 // Atomize constant strings in case they're later used in property lookup.
881 m_atLineStart
= false;
883 lvalp
->ident
= makeIdentifier(m_buffer16
.data(), m_buffer16
.size());
884 m_buffer16
.resize(0);
887 // Fall through into returnToken.
// Location record: both line fields receive the current m_lineNumber, while the
// "column" fields receive source offsets (token start and current offset).
890 int lineNumber
= m_lineNumber
;
891 llocp
->first_line
= lineNumber
;
892 llocp
->last_line
= lineNumber
;
893 llocp
->first_column
= startOffset
;
894 llocp
->last_column
= currentOffset();
// Scans a regular expression literal, storing its body in `pattern` and the trailing
// run of identifier characters in `flags` (both made from m_buffer16).
//   patternPrefix - recorded at the front of the pattern buffer; it must not be a line
//                   terminator, '/' or '[' (asserted). The condition guarding this
//                   section, if any, is not present in this listing.
// Returns bool. NOTE(review): the return statements and the bracket/escape bookkeeping
// lines are absent from this listing.
905 bool Lexer::scanRegExp(const Identifier
*& pattern
, const Identifier
*& flags
, UChar patternPrefix
)
907 ASSERT(m_buffer16
.isEmpty());
909 bool lastWasEscape
= false;
910 bool inBrackets
= false;
913 ASSERT(!isLineTerminator(patternPrefix
));
914 ASSERT(patternPrefix
!= '/');
915 ASSERT(patternPrefix
!= '[');
916 record16(patternPrefix
);
920 int current
= m_current
;
// A line terminator or the -1 end marker inside the literal discards what was buffered.
922 if (isLineTerminator(current
) || current
== -1) {
923 m_buffer16
.resize(0);
// A '/' is only tested as the closing delimiter when it is neither escaped nor inside
// brackets.
929 if (current
== '/' && !lastWasEscape
&& !inBrackets
)
935 lastWasEscape
= false;
947 lastWasEscape
= true;
952 pattern
= makeIdentifier(m_buffer16
.data(), m_buffer16
.size());
953 m_buffer16
.resize(0);
955 while (isIdentPart(m_current
)) {
960 flags
= makeIdentifier(m_buffer16
.data(), m_buffer16
.size());
961 m_buffer16
.resize(0);
// Walks over a regular expression literal using the same delimiter tests as
// scanRegExp(); no buffer writes or identifier creation appear in the visible lines.
// Returns bool. NOTE(review): the return statements and the bracket/escape bookkeeping
// lines are absent from this listing.
966 bool Lexer::skipRegExp()
968 bool lastWasEscape
= false;
969 bool inBrackets
= false;
972 int current
= m_current
;
974 if (isLineTerminator(current
) || current
== -1)
// A '/' is only tested as the closing delimiter when it is neither escaped nor inside
// brackets.
979 if (current
== '/' && !lastWasEscape
&& !inBrackets
)
983 lastWasEscape
= false;
995 lastWasEscape
= true;
// The trailing run of identifier characters is then iterated over.
1000 while (isIdentPart(m_current
))
// NOTE(review): the header of the enclosing function is missing from this listing;
// presumably this is the body of Lexer::clear() -- confirm against the full file.
// Resets the lexer's buffers: the BOM-stripped copy is cleared, and each scratch buffer
// is swapped with a freshly constructed vector that has initialReadBufferCapacity
// reserved. Presumably the swap is used so that storage grown during lexing is freed
// when the temporary goes out of scope -- confirm against Vector's semantics.
1009 m_codeWithoutBOMs
.clear();
1011 Vector
<char> newBuffer8
;
1012 newBuffer8
.reserveInitialCapacity(initialReadBufferCapacity
);
1013 m_buffer8
.swap(newBuffer8
);
1015 Vector
<UChar
> newBuffer16
;
1016 newBuffer16
.reserveInitialCapacity(initialReadBufferCapacity
);
1017 m_buffer16
.swap(newBuffer16
);
1019 m_isReparsing
= false;
// Builds a SourceCode for the range from openBrace through closeBrace (inclusive, hence
// the + 1) on m_source's provider, starting at line firstLine.
// When m_codeWithoutBOMs is non-empty, the offsets are shifted by the number of
// byteOrderMark characters counted in the provider's data: those before openBrace move
// both ends, those between the braces move only the end.
// NOTE(review): the declaration of the loop index `i` is absent from this listing.
1022 SourceCode
Lexer::sourceCode(int openBrace
, int closeBrace
, int firstLine
)
// No BOM-stripped copy exists, so the offsets are used unchanged.
1024 if (m_codeWithoutBOMs
.isEmpty())
1025 return SourceCode(m_source
->provider(), openBrace
, closeBrace
+ 1, firstLine
);
1027 const UChar
* data
= m_source
->provider()->data();
1029 ASSERT(openBrace
< closeBrace
);
1031 int numBOMsBeforeOpenBrace
= 0;
1032 int numBOMsBetweenBraces
= 0;
// The first loop counts BOMs from the source's start offset up to openBrace; the second
// continues with the same index up to closeBrace.
1035 for (i
= m_source
->startOffset(); i
< openBrace
; ++i
)
1036 numBOMsBeforeOpenBrace
+= data
[i
] == byteOrderMark
;
1037 for (; i
< closeBrace
; ++i
)
1038 numBOMsBetweenBraces
+= data
[i
] == byteOrderMark
;
1040 return SourceCode(m_source
->provider(), openBrace
+ numBOMsBeforeOpenBrace
,
1041 closeBrace
+ numBOMsBeforeOpenBrace
+ numBOMsBetweenBraces
+ 1, firstLine
);