2 * Copyright (C) 1999-2000 Harri Porten (porten@kde.org)
3 * Copyright (C) 2006, 2007, 2008, 2009 Apple Inc. All Rights Reserved.
4 * Copyright (C) 2007 Cameron Zwarich (cwzwarich@uwaterloo.ca)
6 * This library is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Library General Public
8 * License as published by the Free Software Foundation; either
9 * version 2 of the License, or (at your option) any later version.
11 * This library is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Library General Public License for more details.
16 * You should have received a copy of the GNU Library General Public License
17 * along with this library; see the file COPYING.LIB. If not, write to
18 * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
19 * Boston, MA 02110-1301, USA.
26 #include "JSFunction.h"
27 #include "JSGlobalObjectFunctions.h"
34 #include <wtf/ASCIICType.h>
35 #include <wtf/Assertions.h>
36 #include <wtf/unicode/Unicode.h>
39 using namespace Unicode
;
41 // we can't specify the namespace in yacc's C output, so do it here
49 #include "Lexer.lut.h"
51 // a bridge for yacc from the C world to C++
52 int jscyylex(void* lvalp
, void* llocp
, void* globalData
)
54 return static_cast<JSGlobalData
*>(globalData
)->lexer
->lex(lvalp
, llocp
);
59 static bool isDecimalDigit(int);
61 Lexer::Lexer(JSGlobalData
* globalData
)
63 , m_restrKeyword(false)
64 , m_eatNextIdentifier(false)
70 , m_isReparsing(false)
80 , m_globalData(globalData
)
81 , m_mainTable(JSC::mainTable
)
83 m_buffer8
.reserveInitialCapacity(initialReadBufferCapacity
);
84 m_buffer16
.reserveInitialCapacity(initialReadBufferCapacity
);
89 m_mainTable
.deleteTable();
92 void Lexer::setCode(const SourceCode
& source
)
94 yylineno
= source
.firstLine();
95 m_restrKeyword
= false;
97 m_eatNextIdentifier
= false;
101 m_position
= source
.startOffset();
103 m_code
= source
.provider()->data();
104 m_length
= source
.endOffset();
108 m_atLineStart
= true;
110 // read first characters
114 void Lexer::shift(unsigned p
)
116 // ECMA-262 calls for stripping Cf characters here, but we only do this for BOM,
117 // see <https://bugs.webkit.org/show_bug.cgi?id=4931>.
123 m_currentOffset
= m_nextOffset1
;
124 m_nextOffset1
= m_nextOffset2
;
125 m_nextOffset2
= m_nextOffset3
;
127 if (m_position
>= m_length
) {
128 m_nextOffset3
= m_position
;
133 m_nextOffset3
= m_position
;
134 m_next3
= m_code
[m_position
++];
135 } while (m_next3
== 0xFEFF);
139 // called on each new line
140 void Lexer::nextLine()
143 m_atLineStart
= true;
146 void Lexer::setDone(State s
)
152 int Lexer::lex(void* p1
, void* p2
)
154 YYSTYPE
* lvalp
= static_cast<YYSTYPE
*>(p1
);
155 YYLTYPE
* llocp
= static_cast<YYLTYPE
*>(p2
);
158 unsigned short stringType
= 0; // either single or double quotes
162 m_terminator
= false;
166 // did we push a token on the stack previously ?
167 // (after an automatic semicolon insertion)
168 if (m_stackToken
>= 0) {
170 token
= m_stackToken
;
173 int startOffset
= m_currentOffset
;
175 if (m_skipLF
&& m_current
!= '\n') // found \r but not \n afterwards
177 if (m_skipCR
&& m_current
!= '\r') // found \n but not \r afterwards
179 if (m_skipLF
|| m_skipCR
) { // found \r\n or \n\r -> eat the second one
186 startOffset
= m_currentOffset
;
187 if (isWhiteSpace()) {
189 } else if (m_current
== '/' && m_next1
== '/') {
191 m_state
= InSingleLineComment
;
192 } else if (m_current
== '/' && m_next1
== '*') {
194 m_state
= InMultiLineComment
;
195 } else if (m_current
== -1) {
196 if (!m_terminator
&& !m_delimited
&& !m_isReparsing
) {
197 // automatic semicolon insertion if program incomplete
203 } else if (isLineTerminator()) {
206 if (m_restrKeyword
) {
210 } else if (m_current
== '"' || m_current
== '\'') {
212 stringType
= static_cast<unsigned short>(m_current
);
213 } else if (isIdentStart(m_current
)) {
215 m_state
= InIdentifierOrKeyword
;
216 } else if (m_current
== '\\')
217 m_state
= InIdentifierStartUnicodeEscapeStart
;
218 else if (m_current
== '0') {
221 } else if (isDecimalDigit(m_current
)) {
224 } else if (m_current
== '.' && isDecimalDigit(m_next1
)) {
227 // <!-- marks the beginning of a line comment (for www usage)
228 } else if (m_current
== '<' && m_next1
== '!' && m_next2
== '-' && m_next3
== '-') {
230 m_state
= InSingleLineComment
;
232 } else if (m_atLineStart
&& m_current
== '-' && m_next1
== '-' && m_next2
== '>') {
234 m_state
= InSingleLineComment
;
236 token
= matchPunctuator(lvalp
->intValue
, m_current
, m_next1
, m_next2
, m_next3
);
244 if (m_current
== stringType
) {
247 } else if (isLineTerminator() || m_current
== -1)
249 else if (m_current
== '\\')
250 m_state
= InEscapeSequence
;
254 // Escape Sequences inside of strings
255 case InEscapeSequence
:
256 if (isOctalDigit(m_current
)) {
257 if (m_current
>= '0' && m_current
<= '3' &&
258 isOctalDigit(m_next1
) && isOctalDigit(m_next2
)) {
259 record16(convertOctal(m_current
, m_next1
, m_next2
));
262 } else if (isOctalDigit(m_current
) && isOctalDigit(m_next1
)) {
263 record16(convertOctal('0', m_current
, m_next1
));
266 } else if (isOctalDigit(m_current
)) {
267 record16(convertOctal('0', '0', m_current
));
271 } else if (m_current
== 'x')
272 m_state
= InHexEscape
;
273 else if (m_current
== 'u')
274 m_state
= InUnicodeEscape
;
275 else if (isLineTerminator()) {
279 record16(singleEscape(static_cast<unsigned short>(m_current
)));
284 if (isHexDigit(m_current
) && isHexDigit(m_next1
)) {
286 record16(convertHex(m_current
, m_next1
));
288 } else if (m_current
== stringType
) {
298 case InUnicodeEscape
:
299 if (isHexDigit(m_current
) && isHexDigit(m_next1
) && isHexDigit(m_next2
) && isHexDigit(m_next3
)) {
300 record16(convertUnicode(m_current
, m_next1
, m_next2
, m_next3
));
303 } else if (m_current
== stringType
) {
310 case InSingleLineComment
:
311 if (isLineTerminator()) {
314 if (m_restrKeyword
) {
319 } else if (m_current
== -1)
322 case InMultiLineComment
:
325 else if (isLineTerminator())
327 else if (m_current
== '*' && m_next1
== '/') {
332 case InIdentifierOrKeyword
:
334 if (isIdentPart(m_current
))
336 else if (m_current
== '\\')
337 m_state
= InIdentifierPartUnicodeEscapeStart
;
339 setDone(m_state
== InIdentifierOrKeyword
? IdentifierOrKeyword
: Identifier
);
342 if (m_current
== 'x' || m_current
== 'X') {
345 } else if (m_current
== '.') {
348 } else if (m_current
== 'e' || m_current
== 'E') {
350 m_state
= InExponentIndicator
;
351 } else if (isOctalDigit(m_current
)) {
354 } else if (isDecimalDigit(m_current
)) {
361 if (isHexDigit(m_current
))
367 if (isOctalDigit(m_current
))
369 else if (isDecimalDigit(m_current
)) {
376 if (isDecimalDigit(m_current
))
378 else if (m_current
== '.') {
381 } else if (m_current
== 'e' || m_current
== 'E') {
383 m_state
= InExponentIndicator
;
388 if (isDecimalDigit(m_current
))
390 else if (m_current
== 'e' || m_current
== 'E') {
392 m_state
= InExponentIndicator
;
396 case InExponentIndicator
:
397 if (m_current
== '+' || m_current
== '-')
399 else if (isDecimalDigit(m_current
)) {
401 m_state
= InExponent
;
406 if (isDecimalDigit(m_current
))
411 case InIdentifierStartUnicodeEscapeStart
:
412 if (m_current
== 'u')
413 m_state
= InIdentifierStartUnicodeEscape
;
417 case InIdentifierPartUnicodeEscapeStart
:
418 if (m_current
== 'u')
419 m_state
= InIdentifierPartUnicodeEscape
;
423 case InIdentifierStartUnicodeEscape
:
424 if (!isHexDigit(m_current
) || !isHexDigit(m_next1
) || !isHexDigit(m_next2
) || !isHexDigit(m_next3
)) {
428 token
= convertUnicode(m_current
, m_next1
, m_next2
, m_next3
);
430 if (!isIdentStart(token
)) {
435 m_state
= InIdentifier
;
437 case InIdentifierPartUnicodeEscape
:
438 if (!isHexDigit(m_current
) || !isHexDigit(m_next1
) || !isHexDigit(m_next2
) || !isHexDigit(m_next3
)) {
442 token
= convertUnicode(m_current
, m_next1
, m_next2
, m_next3
);
444 if (!isIdentPart(token
)) {
449 m_state
= InIdentifier
;
452 ASSERT(!"Unhandled state in switch statement");
455 // move on to the next character
458 if (m_state
!= Start
&& m_state
!= InSingleLineComment
)
459 m_atLineStart
= false;
462 // no identifiers allowed directly after numeric literal, e.g. "3in" is bad
463 if ((m_state
== Number
|| m_state
== Octal
|| m_state
== Hex
) && isIdentStart(m_current
))
467 m_buffer8
.append('\0');
470 fprintf(stderr
, "line: %d ", lineNo());
471 fprintf(stderr
, "yytext (%x): ", m_buffer8
[0]);
472 fprintf(stderr
, "%s ", m_buffer8
.data());
476 if (m_state
== Number
)
477 dval
= WTF::strtod(m_buffer8
.data(), 0L);
478 else if (m_state
== Hex
) { // scan hex numbers
479 const char* p
= m_buffer8
.data() + 2;
480 while (char c
= *p
++) {
482 dval
+= convertHex(c
);
485 if (dval
>= mantissaOverflowLowerBound
)
486 dval
= parseIntOverflow(m_buffer8
.data() + 2, p
- (m_buffer8
.data() + 3), 16);
489 } else if (m_state
== Octal
) { // scan octal number
490 const char* p
= m_buffer8
.data() + 1;
491 while (char c
= *p
++) {
496 if (dval
>= mantissaOverflowLowerBound
)
497 dval
= parseIntOverflow(m_buffer8
.data() + 1, p
- (m_buffer8
.data() + 2), 8);
511 printf("(Identifier)/(Keyword)\n");
514 printf("(String)\n");
517 printf("(Number)\n");
524 if (m_state
!= Identifier
)
525 m_eatNextIdentifier
= false;
527 m_restrKeyword
= false;
529 llocp
->first_line
= yylineno
;
530 llocp
->last_line
= yylineno
;
531 llocp
->first_column
= startOffset
;
532 llocp
->last_column
= m_currentOffset
;
538 if (token
== '}' || token
== ';')
542 // Apply anonymous-function hack below (eat the identifier).
543 if (m_eatNextIdentifier
) {
544 m_eatNextIdentifier
= false;
545 token
= lex(lvalp
, llocp
);
548 lvalp
->ident
= makeIdentifier(m_buffer16
);
551 case IdentifierOrKeyword
: {
552 lvalp
->ident
= makeIdentifier(m_buffer16
);
553 const HashEntry
* entry
= m_mainTable
.entry(m_globalData
, *lvalp
->ident
);
555 // Lookup for keyword failed, means this is an identifier.
559 token
= entry
->lexerValue();
560 // Hack for "f = function somename() { ... }"; too hard to get into the grammar.
561 m_eatNextIdentifier
= token
== FUNCTION
&& m_lastToken
== '=';
562 if (token
== CONTINUE
|| token
== BREAK
|| token
== RETURN
|| token
== THROW
)
563 m_restrKeyword
= true;
567 // Atomize constant strings in case they're later used in property lookup.
568 lvalp
->ident
= makeIdentifier(m_buffer16
);
572 lvalp
->doubleValue
= dval
;
577 fprintf(stderr
, "yylex: ERROR.\n");
582 ASSERT(!"unhandled numeration value in switch");
590 bool Lexer::isWhiteSpace() const
592 return m_current
== '\t' || m_current
== 0x0b || m_current
== 0x0c || isSeparatorSpace(m_current
);
595 bool Lexer::isLineTerminator()
597 bool cr
= (m_current
== '\r');
598 bool lf
= (m_current
== '\n');
603 return cr
|| lf
|| m_current
== 0x2028 || m_current
== 0x2029;
606 bool Lexer::isIdentStart(int c
)
608 return isASCIIAlpha(c
) || c
== '$' || c
== '_' || (!isASCII(c
) && (category(c
) & (Letter_Uppercase
| Letter_Lowercase
| Letter_Titlecase
| Letter_Modifier
| Letter_Other
)));
611 bool Lexer::isIdentPart(int c
)
613 return isASCIIAlphanumeric(c
) || c
== '$' || c
== '_' || (!isASCII(c
) && (category(c
) & (Letter_Uppercase
| Letter_Lowercase
| Letter_Titlecase
| Letter_Modifier
| Letter_Other
614 | Mark_NonSpacing
| Mark_SpacingCombining
| Number_DecimalDigit
| Punctuation_Connector
)));
617 static bool isDecimalDigit(int c
)
619 return isASCIIDigit(c
);
622 bool Lexer::isHexDigit(int c
)
624 return isASCIIHexDigit(c
);
627 bool Lexer::isOctalDigit(int c
)
629 return isASCIIOctalDigit(c
);
632 int Lexer::matchPunctuator(int& charPos
, int c1
, int c2
, int c3
, int c4
)
634 if (c1
== '>' && c2
== '>' && c3
== '>' && c4
== '=') {
638 if (c1
== '=' && c2
== '=' && c3
== '=') {
642 if (c1
== '!' && c2
== '=' && c3
== '=') {
646 if (c1
== '>' && c2
== '>' && c3
== '>') {
650 if (c1
== '<' && c2
== '<' && c3
== '=') {
654 if (c1
== '>' && c2
== '>' && c3
== '=') {
658 if (c1
== '<' && c2
== '=') {
662 if (c1
== '>' && c2
== '=') {
666 if (c1
== '!' && c2
== '=') {
670 if (c1
== '+' && c2
== '+') {
676 if (c1
== '-' && c2
== '-') {
679 return AUTOMINUSMINUS
;
682 if (c1
== '=' && c2
== '=') {
686 if (c1
== '+' && c2
== '=') {
690 if (c1
== '-' && c2
== '=') {
694 if (c1
== '*' && c2
== '=') {
698 if (c1
== '/' && c2
== '=') {
702 if (c1
== '&' && c2
== '=') {
706 if (c1
== '^' && c2
== '=') {
710 if (c1
== '%' && c2
== '=') {
714 if (c1
== '|' && c2
== '=') {
718 if (c1
== '<' && c2
== '<') {
722 if (c1
== '>' && c2
== '>') {
726 if (c1
== '&' && c2
== '&') {
730 if (c1
== '|' && c2
== '|') {
759 return static_cast<int>(c1
);
761 charPos
= m_currentOffset
;
765 charPos
= m_currentOffset
;
773 unsigned short Lexer::singleEscape(unsigned short c
)
799 unsigned short Lexer::convertOctal(int c1
, int c2
, int c3
)
801 return static_cast<unsigned short>((c1
- '0') * 64 + (c2
- '0') * 8 + c3
- '0');
804 unsigned char Lexer::convertHex(int c
)
806 if (c
>= '0' && c
<= '9')
807 return static_cast<unsigned char>(c
- '0');
808 if (c
>= 'a' && c
<= 'f')
809 return static_cast<unsigned char>(c
- 'a' + 10);
810 return static_cast<unsigned char>(c
- 'A' + 10);
813 unsigned char Lexer::convertHex(int c1
, int c2
)
815 return ((convertHex(c1
) << 4) + convertHex(c2
));
818 UChar
Lexer::convertUnicode(int c1
, int c2
, int c3
, int c4
)
820 unsigned char highByte
= (convertHex(c1
) << 4) + convertHex(c2
);
821 unsigned char lowByte
= (convertHex(c3
) << 4) + convertHex(c4
);
822 return (highByte
<< 8 | lowByte
);
825 void Lexer::record8(int c
)
829 m_buffer8
.append(static_cast<char>(c
));
832 void Lexer::record16(int c
)
835 ASSERT(c
<= USHRT_MAX
);
836 record16(UChar(static_cast<unsigned short>(c
)));
839 void Lexer::record16(UChar c
)
841 m_buffer16
.append(c
);
844 bool Lexer::scanRegExp()
847 bool lastWasEscape
= false;
848 bool inBrackets
= false;
851 if (isLineTerminator() || m_current
== -1)
853 else if (m_current
!= '/' || lastWasEscape
== true || inBrackets
== true) {
854 // keep track of '[' and ']'
855 if (!lastWasEscape
) {
856 if ( m_current
== '[' && !inBrackets
)
858 if ( m_current
== ']' && inBrackets
)
863 !lastWasEscape
&& (m_current
== '\\');
864 } else { // end of regexp
865 m_pattern
= UString(m_buffer16
);
873 while (isIdentPart(m_current
)) {
877 m_flags
= UString(m_buffer16
);
884 m_identifiers
.clear();
886 Vector
<char> newBuffer8
;
887 newBuffer8
.reserveInitialCapacity(initialReadBufferCapacity
);
888 m_buffer8
.swap(newBuffer8
);
890 Vector
<UChar
> newBuffer16
;
891 newBuffer16
.reserveInitialCapacity(initialReadBufferCapacity
);
892 m_buffer16
.swap(newBuffer16
);
894 m_isReparsing
= false;