2 * Copyright (C) 1999-2000 Harri Porten (porten@kde.org)
3 * Copyright (C) 2006, 2007, 2008, 2009 Apple Inc. All Rights Reserved.
4 * Copyright (C) 2007 Cameron Zwarich (cwzwarich@uwaterloo.ca)
6 * This library is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Library General Public
8 * License as published by the Free Software Foundation; either
9 * version 2 of the License, or (at your option) any later version.
11 * This library is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Library General Public License for more details.
16 * You should have received a copy of the GNU Library General Public License
17 * along with this library; see the file COPYING.LIB. If not, write to
18 * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
19 * Boston, MA 02110-1301, USA.
26 #include "JSFunction.h"
27 #include "JSGlobalObjectFunctions.h"
34 #include <wtf/Assertions.h>
37 using namespace Unicode
;
39 // We can't specify the namespace in yacc's C output, so do it here instead.
47 #include "Lexer.lut.h"
49 // A bridge for yacc from the C world to the C++ world.
50 int jscyylex(void* lvalp
, void* llocp
, void* globalData
)
52 return static_cast<JSGlobalData
*>(globalData
)->lexer
->lex(lvalp
, llocp
);
57 static const UChar byteOrderMark
= 0xFEFF;
59 Lexer::Lexer(JSGlobalData
* globalData
)
60 : m_isReparsing(false)
61 , m_globalData(globalData
)
62 , m_keywordTable(JSC::mainTable
)
64 m_buffer8
.reserveInitialCapacity(initialReadBufferCapacity
);
65 m_buffer16
.reserveInitialCapacity(initialReadBufferCapacity
);
70 m_keywordTable
.deleteTable();
73 inline const UChar
* Lexer::currentCharacter() const
78 inline int Lexer::currentOffset() const
80 return currentCharacter() - m_codeStart
;
83 ALWAYS_INLINE
void Lexer::shift1()
88 if (LIKELY(m_code
< m_codeEnd
))
96 ALWAYS_INLINE
void Lexer::shift2()
100 if (LIKELY(m_code
+ 1 < m_codeEnd
)) {
104 m_next2
= m_code
< m_codeEnd
? m_code
[0] : -1;
111 ALWAYS_INLINE
void Lexer::shift3()
114 if (LIKELY(m_code
+ 2 < m_codeEnd
)) {
119 m_next1
= m_code
< m_codeEnd
? m_code
[0] : -1;
120 m_next2
= m_code
+ 1 < m_codeEnd
? m_code
[1] : -1;
127 ALWAYS_INLINE
void Lexer::shift4()
129 if (LIKELY(m_code
+ 3 < m_codeEnd
)) {
130 m_current
= m_code
[0];
135 m_current
= m_code
< m_codeEnd
? m_code
[0] : -1;
136 m_next1
= m_code
+ 1 < m_codeEnd
? m_code
[1] : -1;
137 m_next2
= m_code
+ 2 < m_codeEnd
? m_code
[2] : -1;
144 void Lexer::setCode(const SourceCode
& source
)
146 m_lineNumber
= source
.firstLine();
150 const UChar
* data
= source
.provider()->data();
154 m_code
= data
+ source
.startOffset();
155 m_codeEnd
= data
+ source
.endOffset();
157 m_atLineStart
= true;
159 // ECMA-262 calls for stripping all Cf characters, but we only strip BOM characters.
160 // See <https://bugs.webkit.org/show_bug.cgi?id=4931> for details.
161 if (source
.provider()->hasBOMs()) {
162 for (const UChar
* p
= m_codeStart
; p
< m_codeEnd
; ++p
) {
163 if (UNLIKELY(*p
== byteOrderMark
)) {
164 copyCodeWithoutBOMs();
170 // Read the first characters into the 4-character buffer.
172 ASSERT(currentOffset() == source
.startOffset());
175 void Lexer::copyCodeWithoutBOMs()
177 // Note: In this case, the character offset data for debugging will be incorrect.
178 // If it's important to correctly debug code with extraneous BOMs, then the caller
179 // should strip the BOMs when creating the SourceProvider object and do its own
180 // mapping of offsets within the stripped text to original text offset.
182 m_codeWithoutBOMs
.reserveCapacity(m_codeEnd
- m_code
);
183 for (const UChar
* p
= m_code
; p
< m_codeEnd
; ++p
) {
185 if (c
!= byteOrderMark
)
186 m_codeWithoutBOMs
.append(c
);
188 ptrdiff_t startDelta
= m_codeStart
- m_code
;
189 m_code
= m_codeWithoutBOMs
.data();
190 m_codeStart
= m_code
+ startDelta
;
191 m_codeEnd
= m_codeWithoutBOMs
.data() + m_codeWithoutBOMs
.size();
194 void Lexer::shiftLineTerminator()
196 ASSERT(isLineTerminator(m_current
));
198 // Allow both CRLF and LFCR.
199 if (m_current
+ m_next1
== '\n' + '\r')
207 ALWAYS_INLINE Identifier
* Lexer::makeIdentifier(const UChar
* characters
, size_t length
)
209 m_identifiers
.append(Identifier(m_globalData
, characters
, length
));
210 return &m_identifiers
.last();
213 inline bool Lexer::lastTokenWasRestrKeyword() const
215 return m_lastToken
== CONTINUE
|| m_lastToken
== BREAK
|| m_lastToken
== RETURN
|| m_lastToken
== THROW
;
218 static NEVER_INLINE
bool isNonASCIIIdentStart(int c
)
220 return category(c
) & (Letter_Uppercase
| Letter_Lowercase
| Letter_Titlecase
| Letter_Modifier
| Letter_Other
);
223 static inline bool isIdentStart(int c
)
225 return isASCII(c
) ? isASCIIAlpha(c
) || c
== '$' || c
== '_' : isNonASCIIIdentStart(c
);
228 static NEVER_INLINE
bool isNonASCIIIdentPart(int c
)
230 return category(c
) & (Letter_Uppercase
| Letter_Lowercase
| Letter_Titlecase
| Letter_Modifier
| Letter_Other
231 | Mark_NonSpacing
| Mark_SpacingCombining
| Number_DecimalDigit
| Punctuation_Connector
);
234 static inline bool isIdentPart(int c
)
236 return isASCII(c
) ? isASCIIAlphanumeric(c
) || c
== '$' || c
== '_' : isNonASCIIIdentPart(c
);
239 static inline int singleEscape(int c
)
259 inline void Lexer::record8(int c
)
263 m_buffer8
.append(static_cast<char>(c
));
266 inline void Lexer::record16(UChar c
)
268 m_buffer16
.append(c
);
271 inline void Lexer::record16(int c
)
274 ASSERT(c
<= USHRT_MAX
);
275 record16(UChar(static_cast<unsigned short>(c
)));
278 int Lexer::lex(void* p1
, void* p2
)
281 ASSERT(m_buffer8
.isEmpty());
282 ASSERT(m_buffer16
.isEmpty());
284 YYSTYPE
* lvalp
= static_cast<YYSTYPE
*>(p1
);
285 YYLTYPE
* llocp
= static_cast<YYLTYPE
*>(p2
);
287 m_terminator
= false;
290 while (isWhiteSpace(m_current
))
293 int startOffset
= currentOffset();
295 if (m_current
== -1) {
296 if (!m_terminator
&& !m_delimited
&& !m_isReparsing
) {
297 // automatic semicolon insertion if program incomplete
307 if (m_next1
== '>' && m_next2
== '>') {
308 if (m_next3
== '=') {
310 token
= URSHIFTEQUAL
;
317 if (m_next1
== '>') {
318 if (m_next2
== '=') {
327 if (m_next1
== '=') {
336 if (m_next1
== '=') {
337 if (m_next2
== '=') {
350 if (m_next1
== '=') {
351 if (m_next2
== '=') {
364 if (m_next1
== '!' && m_next2
== '-' && m_next3
== '-') {
365 // <!-- marks the beginning of a line comment (for www usage)
367 goto inSingleLineComment
;
369 if (m_next1
== '<') {
370 if (m_next2
== '=') {
379 if (m_next1
== '=') {
388 if (m_next1
== '+') {
391 token
= AUTOPLUSPLUS
;
397 if (m_next1
== '=') {
406 if (m_next1
== '-') {
407 if (m_atLineStart
&& m_next2
== '>') {
409 goto inSingleLineComment
;
413 token
= AUTOMINUSMINUS
;
419 if (m_next1
== '=') {
428 if (m_next1
== '=') {
437 if (m_next1
== '/') {
439 goto inSingleLineComment
;
442 goto inMultiLineComment
;
443 if (m_next1
== '=') {
452 if (m_next1
== '&') {
457 if (m_next1
== '=') {
466 if (m_next1
== '=') {
475 if (m_next1
== '=') {
484 if (m_next1
== '=') {
489 if (m_next1
== '|') {
498 if (isASCIIDigit(m_next1
)) {
501 goto inNumberAfterDecimalPoint
;
523 lvalp
->intValue
= currentOffset();
528 lvalp
->intValue
= currentOffset();
534 goto startIdentifierWithBackslash
;
536 goto startNumberWithZeroDigit
;
551 if (isIdentStart(m_current
))
552 goto startIdentifierOrKeyword
;
553 if (isLineTerminator(m_current
)) {
554 shiftLineTerminator();
555 m_atLineStart
= true;
557 if (lastTokenWasRestrKeyword()) {
566 m_atLineStart
= false;
570 int stringQuoteCharacter
= m_current
;
573 const UChar
* stringStart
= currentCharacter();
574 while (m_current
!= stringQuoteCharacter
) {
575 // Fast check for characters that require special handling.
576 // Catches -1, \n, \r, \, 0x2028, and 0x2029 as efficiently
577 // as possible, and lets through all common ASCII characters.
578 if (UNLIKELY(m_current
== '\\') || UNLIKELY(((static_cast<unsigned>(m_current
) - 0xE) & 0x2000))) {
579 m_buffer16
.append(stringStart
, currentCharacter() - stringStart
);
584 lvalp
->ident
= makeIdentifier(stringStart
, currentCharacter() - stringStart
);
586 m_atLineStart
= false;
592 while (m_current
!= stringQuoteCharacter
) {
593 if (m_current
== '\\')
594 goto inStringEscapeSequence
;
595 if (UNLIKELY(isLineTerminator(m_current
)))
597 if (UNLIKELY(m_current
== -1))
604 inStringEscapeSequence
:
606 if (m_current
== 'x') {
608 if (isASCIIHexDigit(m_current
) && isASCIIHexDigit(m_next1
)) {
609 record16(convertHex(m_current
, m_next1
));
614 if (m_current
== stringQuoteCharacter
)
618 if (m_current
== 'u') {
620 if (isASCIIHexDigit(m_current
) && isASCIIHexDigit(m_next1
) && isASCIIHexDigit(m_next2
) && isASCIIHexDigit(m_next3
)) {
621 record16(convertUnicode(m_current
, m_next1
, m_next2
, m_next3
));
625 if (m_current
== stringQuoteCharacter
) {
631 if (isASCIIOctalDigit(m_current
)) {
632 if (m_current
>= '0' && m_current
<= '3' && isASCIIOctalDigit(m_next1
) && isASCIIOctalDigit(m_next2
)) {
633 record16((m_current
- '0') * 64 + (m_next1
- '0') * 8 + m_next2
- '0');
637 if (isASCIIOctalDigit(m_next1
)) {
638 record16((m_current
- '0') * 8 + m_next1
- '0');
642 record16(m_current
- '0');
646 if (isLineTerminator(m_current
)) {
647 shiftLineTerminator();
650 record16(singleEscape(m_current
));
655 startIdentifierWithBackslash
:
657 if (UNLIKELY(m_current
!= 'u'))
660 if (UNLIKELY(!isASCIIHexDigit(m_current
) || !isASCIIHexDigit(m_next1
) || !isASCIIHexDigit(m_next2
) || !isASCIIHexDigit(m_next3
)))
662 token
= convertUnicode(m_current
, m_next1
, m_next2
, m_next3
);
663 if (UNLIKELY(!isIdentStart(token
)))
665 goto inIdentifierAfterCharacterCheck
;
667 startIdentifierOrKeyword
: {
668 const UChar
* identifierStart
= currentCharacter();
670 while (isIdentPart(m_current
))
672 if (LIKELY(m_current
!= '\\')) {
673 lvalp
->ident
= makeIdentifier(identifierStart
, currentCharacter() - identifierStart
);
674 goto doneIdentifierOrKeyword
;
676 m_buffer16
.append(identifierStart
, currentCharacter() - identifierStart
);
681 if (UNLIKELY(m_current
!= 'u'))
684 if (UNLIKELY(!isASCIIHexDigit(m_current
) || !isASCIIHexDigit(m_next1
) || !isASCIIHexDigit(m_next2
) || !isASCIIHexDigit(m_next3
)))
686 token
= convertUnicode(m_current
, m_next1
, m_next2
, m_next3
);
687 if (UNLIKELY(!isIdentPart(token
)))
689 inIdentifierAfterCharacterCheck
:
693 while (isIdentPart(m_current
)) {
697 } while (UNLIKELY(m_current
== '\\'));
701 while (!isLineTerminator(m_current
)) {
702 if (UNLIKELY(m_current
== -1))
706 shiftLineTerminator();
707 m_atLineStart
= true;
709 if (lastTokenWasRestrKeyword())
715 while (m_current
!= '*' || m_next1
!= '/') {
716 if (isLineTerminator(m_current
))
717 shiftLineTerminator();
720 if (UNLIKELY(m_current
== -1))
725 m_atLineStart
= false;
728 startNumberWithZeroDigit
:
730 if ((m_current
| 0x20) == 'x' && isASCIIHexDigit(m_next1
)) {
734 if (m_current
== '.') {
738 goto inNumberAfterDecimalPoint
;
740 if ((m_current
| 0x20) == 'e') {
744 goto inExponentIndicator
;
746 if (isASCIIOctalDigit(m_current
))
748 if (isASCIIDigit(m_current
))
750 lvalp
->doubleValue
= 0;
753 inNumberAfterDecimalPoint
:
754 while (isASCIIDigit(m_current
)) {
758 if ((m_current
| 0x20) == 'e') {
761 goto inExponentIndicator
;
766 if (m_current
== '+' || m_current
== '-') {
770 if (!isASCIIDigit(m_current
))
775 } while (isASCIIDigit(m_current
));
782 } while (isASCIIOctalDigit(m_current
));
783 if (isASCIIDigit(m_current
))
788 const char* end
= m_buffer8
.end();
789 for (const char* p
= m_buffer8
.data(); p
< end
; ++p
) {
793 if (dval
>= mantissaOverflowLowerBound
)
794 dval
= parseIntOverflow(m_buffer8
.data(), end
- m_buffer8
.data(), 8);
798 lvalp
->doubleValue
= dval
;
806 } while (isASCIIHexDigit(m_current
));
810 const char* end
= m_buffer8
.end();
811 for (const char* p
= m_buffer8
.data(); p
< end
; ++p
) {
813 dval
+= toASCIIHexValue(*p
);
815 if (dval
>= mantissaOverflowLowerBound
)
816 dval
= parseIntOverflow(m_buffer8
.data(), end
- m_buffer8
.data(), 16);
820 lvalp
->doubleValue
= dval
;
827 while (isASCIIDigit(m_current
)) {
831 if (m_current
== '.') {
834 goto inNumberAfterDecimalPoint
;
836 if ((m_current
| 0x20) == 'e') {
839 goto inExponentIndicator
;
842 // Fall through into doneNumber.
845 // Null-terminate string for strtod.
846 m_buffer8
.append('\0');
847 lvalp
->doubleValue
= WTF::strtod(m_buffer8
.data(), 0);
850 // Fall through into doneNumeric.
853 // No identifiers allowed directly after numeric literal, e.g. "3in" is bad.
854 if (UNLIKELY(isIdentStart(m_current
)))
857 m_atLineStart
= false;
868 m_atLineStart
= false;
870 lvalp
->ident
= makeIdentifier(m_buffer16
.data(), m_buffer16
.size());
871 m_buffer16
.resize(0);
875 doneIdentifierOrKeyword
: {
876 m_atLineStart
= false;
878 m_buffer16
.resize(0);
879 const HashEntry
* entry
= m_keywordTable
.entry(m_globalData
, *lvalp
->ident
);
880 token
= entry
? entry
->lexerValue() : IDENT
;
885 // Atomize constant strings in case they're later used in property lookup.
887 m_atLineStart
= false;
889 lvalp
->ident
= makeIdentifier(m_buffer16
.data(), m_buffer16
.size());
890 m_buffer16
.resize(0);
893 // Fall through into returnToken.
896 int lineNumber
= m_lineNumber
;
897 llocp
->first_line
= lineNumber
;
898 llocp
->last_line
= lineNumber
;
899 llocp
->first_column
= startOffset
;
900 llocp
->last_column
= currentOffset();
911 bool Lexer::scanRegExp()
913 ASSERT(m_buffer16
.isEmpty());
915 bool lastWasEscape
= false;
916 bool inBrackets
= false;
919 if (isLineTerminator(m_current
) || m_current
== -1)
921 if (m_current
!= '/' || lastWasEscape
|| inBrackets
) {
922 // keep track of '[' and ']'
923 if (!lastWasEscape
) {
924 if (m_current
== '[' && !inBrackets
)
926 if (m_current
== ']' && inBrackets
)
930 lastWasEscape
= !lastWasEscape
&& m_current
== '\\';
931 } else { // end of regexp
932 m_pattern
= UString(m_buffer16
);
933 m_buffer16
.resize(0);
940 while (isIdentPart(m_current
)) {
944 m_flags
= UString(m_buffer16
);
945 m_buffer16
.resize(0);
952 m_identifiers
.clear();
953 m_codeWithoutBOMs
.clear();
955 Vector
<char> newBuffer8
;
956 newBuffer8
.reserveInitialCapacity(initialReadBufferCapacity
);
957 m_buffer8
.swap(newBuffer8
);
959 Vector
<UChar
> newBuffer16
;
960 newBuffer16
.reserveInitialCapacity(initialReadBufferCapacity
);
961 m_buffer16
.swap(newBuffer16
);
963 m_isReparsing
= false;
965 m_pattern
= UString();
969 SourceCode
Lexer::sourceCode(int openBrace
, int closeBrace
, int firstLine
)
971 if (m_codeWithoutBOMs
.isEmpty())
972 return SourceCode(m_source
->provider(), openBrace
, closeBrace
+ 1, firstLine
);
974 const UChar
* data
= m_source
->provider()->data();
976 ASSERT(openBrace
< closeBrace
);
978 int numBOMsBeforeOpenBrace
= 0;
979 int numBOMsBetweenBraces
= 0;
982 for (i
= m_source
->startOffset(); i
< openBrace
; ++i
)
983 numBOMsBeforeOpenBrace
+= data
[i
] == byteOrderMark
;
984 for (; i
< closeBrace
; ++i
)
985 numBOMsBetweenBraces
+= data
[i
] == byteOrderMark
;
987 return SourceCode(m_source
->provider(), openBrace
+ numBOMsBeforeOpenBrace
,
988 closeBrace
+ numBOMsBeforeOpenBrace
+ numBOMsBetweenBraces
+ 1, firstLine
);