1 // -*- c-basic-offset: 2 -*-
3 * Copyright (C) 1999-2000 Harri Porten (porten@kde.org)
4 * Copyright (C) 2006, 2007, 2008 Apple Inc. All Rights Reserved.
5 * Copyright (C) 2007 Cameron Zwarich (cwzwarich@uwaterloo.ca)
7 * This library is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU Library General Public
9 * License as published by the Free Software Foundation; either
10 * version 2 of the License, or (at your option) any later version.
12 * This library is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 * Library General Public License for more details.
17 * You should have received a copy of the GNU Library General Public License
18 * along with this library; see the file COPYING.LIB. If not, write to
19 * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
20 * Boston, MA 02110-1301, USA.
34 #include <wtf/Assertions.h>
35 #include <wtf/unicode/Unicode.h>
38 using namespace Unicode
;
40 // we can't specify the namespace in yacc's C output, so do it here
48 #include "lexer.lut.h"
50 extern YYLTYPE kjsyylloc
; // global bison variable holding token info
52 // a bridge for yacc from the C world to C++
60 static bool isDecimalDigit(int);
62 static const size_t initialReadBufferCapacity
= 32;
63 static const size_t initialStringTableCapacity
= 64;
67 ASSERT(JSLock::currentThreadIsHoldingLock());
69 // FIXME: We'd like to avoid calling new here, but we don't currently
70 // support tearing down the Lexer at app quit time, since that would involve
71 // tearing down its UString data members without holding the JSLock.
72 static Lexer
* staticLexer
= new Lexer
;
79 , eatNextIdentifier(false)
95 m_buffer8
.reserveCapacity(initialReadBufferCapacity
);
96 m_buffer16
.reserveCapacity(initialReadBufferCapacity
);
97 m_strings
.reserveCapacity(initialStringTableCapacity
);
98 m_identifiers
.reserveCapacity(initialStringTableCapacity
);
// Points the lexer at a new SourceCode and resets per-source state: the line
// counter starts at source.firstLine(), the restrKeyword and eatNextIdentifier
// flags are cleared, `code` is set to the provider's data advanced by
// source.startOffset(), and `length` is taken from source.length().
// NOTE(review): several statements of this function are not visible in this
// excerpt, so additional state may be reset here as well.
101 void Lexer::setCode(const SourceCode
& source
)
103 yylineno
= source
.firstLine();
104 restrKeyword
= false;
106 eatNextIdentifier
= false;
111 code
= source
.provider()->data() + source
.startOffset();
112 length
= source
.length();
118 // read first characters
// Advances the lookahead window. In the visible part, the recorded offsets
// slide down one slot (m_currentOffset <- m_nextOffset1 <- m_nextOffset2 <-
// m_nextOffset3) and the next code unit is read from code[pos++] into next3;
// the read is repeated while that unit is 0xFEFF, so BOM characters are skipped.
// NOTE(review): p is presumably the number of characters to advance, and the
// loop header using it is not visible in this excerpt -- confirm.
122 void Lexer::shift(unsigned p
)
124 // ECMA-262 calls for stripping Cf characters here, but we only do this for BOM,
125 // see <https://bugs.webkit.org/show_bug.cgi?id=4931>.
131 m_currentOffset
= m_nextOffset1
;
132 m_nextOffset1
= m_nextOffset2
;
133 m_nextOffset2
= m_nextOffset3
;
142 next3
= code
[pos
++].uc
;
143 } while (next3
== 0xFEFF);
147 // called on each new line
148 void Lexer::nextLine()
154 void Lexer::setDone(State s
)
164 unsigned short stringType
= 0; // either single or double quotes
172 // did we push a token on the stack previously ?
173 // (after an automatic semicolon insertion)
174 if (stackToken
>= 0) {
180 int startOffset
= m_currentOffset
;
182 if (skipLF
&& current
!= '\n') // found \r but not \n afterwards
184 if (skipCR
&& current
!= '\r') // found \n but not \r afterwards
186 if (skipLF
|| skipCR
) // found \r\n or \n\r -> eat the second one
194 startOffset
= m_currentOffset
;
195 if (isWhiteSpace()) {
197 } else if (current
== '/' && next1
== '/') {
199 state
= InSingleLineComment
;
200 } else if (current
== '/' && next1
== '*') {
202 state
= InMultiLineComment
;
203 } else if (current
== -1) {
204 if (!terminator
&& !delimited
) {
205 // automatic semicolon insertion if program incomplete
211 } else if (isLineTerminator()) {
218 } else if (current
== '"' || current
== '\'') {
220 stringType
= static_cast<unsigned short>(current
);
221 } else if (isIdentStart(current
)) {
223 state
= InIdentifierOrKeyword
;
224 } else if (current
== '\\') {
225 state
= InIdentifierStartUnicodeEscapeStart
;
226 } else if (current
== '0') {
229 } else if (isDecimalDigit(current
)) {
232 } else if (current
== '.' && isDecimalDigit(next1
)) {
235 // <!-- marks the beginning of a line comment (for www usage)
236 } else if (current
== '<' && next1
== '!' &&
237 next2
== '-' && next3
== '-') {
239 state
= InSingleLineComment
;
241 } else if (atLineStart
&& current
== '-' && next1
== '-' && next2
== '>') {
243 state
= InSingleLineComment
;
245 token
= matchPunctuator(kjsyylval
.intValue
, current
, next1
, next2
, next3
);
249 // cerr << "encountered unknown character" << endl;
255 if (current
== stringType
) {
258 } else if (isLineTerminator() || current
== -1) {
260 } else if (current
== '\\') {
261 state
= InEscapeSequence
;
266 // Escape Sequences inside of strings
267 case InEscapeSequence
:
268 if (isOctalDigit(current
)) {
269 if (current
>= '0' && current
<= '3' &&
270 isOctalDigit(next1
) && isOctalDigit(next2
)) {
271 record16(convertOctal(current
, next1
, next2
));
274 } else if (isOctalDigit(current
) && isOctalDigit(next1
)) {
275 record16(convertOctal('0', current
, next1
));
278 } else if (isOctalDigit(current
)) {
279 record16(convertOctal('0', '0', current
));
284 } else if (current
== 'x')
286 else if (current
== 'u')
287 state
= InUnicodeEscape
;
288 else if (isLineTerminator()) {
292 record16(singleEscape(static_cast<unsigned short>(current
)));
297 if (isHexDigit(current
) && isHexDigit(next1
)) {
299 record16(convertHex(current
, next1
));
301 } else if (current
== stringType
) {
311 case InUnicodeEscape
:
312 if (isHexDigit(current
) && isHexDigit(next1
) && isHexDigit(next2
) && isHexDigit(next3
)) {
313 record16(convertUnicode(current
, next1
, next2
, next3
));
316 } else if (current
== stringType
) {
324 case InSingleLineComment
:
325 if (isLineTerminator()) {
333 } else if (current
== -1) {
337 case InMultiLineComment
:
340 } else if (isLineTerminator()) {
342 } else if (current
== '*' && next1
== '/') {
347 case InIdentifierOrKeyword
:
349 if (isIdentPart(current
))
351 else if (current
== '\\')
352 state
= InIdentifierPartUnicodeEscapeStart
;
354 setDone(state
== InIdentifierOrKeyword
? IdentifierOrKeyword
: Identifier
);
357 if (current
== 'x' || current
== 'X') {
360 } else if (current
== '.') {
363 } else if (current
== 'e' || current
== 'E') {
365 state
= InExponentIndicator
;
366 } else if (isOctalDigit(current
)) {
369 } else if (isDecimalDigit(current
)) {
377 if (isHexDigit(current
)) {
384 if (isOctalDigit(current
)) {
387 else if (isDecimalDigit(current
)) {
394 if (isDecimalDigit(current
)) {
396 } else if (current
== '.') {
399 } else if (current
== 'e' || current
== 'E') {
401 state
= InExponentIndicator
;
406 if (isDecimalDigit(current
)) {
408 } else if (current
== 'e' || current
== 'E') {
410 state
= InExponentIndicator
;
414 case InExponentIndicator
:
415 if (current
== '+' || current
== '-') {
417 } else if (isDecimalDigit(current
)) {
424 if (isDecimalDigit(current
)) {
429 case InIdentifierStartUnicodeEscapeStart
:
431 state
= InIdentifierStartUnicodeEscape
;
435 case InIdentifierPartUnicodeEscapeStart
:
437 state
= InIdentifierPartUnicodeEscape
;
441 case InIdentifierStartUnicodeEscape
:
442 if (!isHexDigit(current
) || !isHexDigit(next1
) || !isHexDigit(next2
) || !isHexDigit(next3
)) {
446 token
= convertUnicode(current
, next1
, next2
, next3
).uc
;
448 if (!isIdentStart(token
)) {
453 state
= InIdentifier
;
455 case InIdentifierPartUnicodeEscape
:
456 if (!isHexDigit(current
) || !isHexDigit(next1
) || !isHexDigit(next2
) || !isHexDigit(next3
)) {
460 token
= convertUnicode(current
, next1
, next2
, next3
).uc
;
462 if (!isIdentPart(token
)) {
467 state
= InIdentifier
;
470 ASSERT(!"Unhandled state in switch statement");
473 // move on to the next character
476 if (state
!= Start
&& state
!= InSingleLineComment
)
480 // no identifiers allowed directly after numeric literal, e.g. "3in" is bad
481 if ((state
== Number
|| state
== Octal
|| state
== Hex
) && isIdentStart(current
))
485 m_buffer8
.append('\0');
488 fprintf(stderr
, "line: %d ", lineNo());
489 fprintf(stderr
, "yytext (%x): ", m_buffer8
[0]);
490 fprintf(stderr
, "%s ", buffer8
.data());
494 if (state
== Number
) {
495 dval
= kjs_strtod(m_buffer8
.data(), 0L);
496 } else if (state
== Hex
) { // scan hex numbers
497 const char* p
= m_buffer8
.data() + 2;
498 while (char c
= *p
++) {
500 dval
+= convertHex(c
);
503 if (dval
>= mantissaOverflowLowerBound
)
504 dval
= parseIntOverflow(m_buffer8
.data() + 2, p
- (m_buffer8
.data() + 3), 16);
507 } else if (state
== Octal
) { // scan octal number
508 const char* p
= m_buffer8
.data() + 1;
509 while (char c
= *p
++) {
514 if (dval
>= mantissaOverflowLowerBound
)
515 dval
= parseIntOverflow(m_buffer8
.data() + 1, p
- (m_buffer8
.data() + 2), 8);
529 printf("(Identifier)/(Keyword)\n");
532 printf("(String)\n");
535 printf("(Number)\n");
542 if (state
!= Identifier
&& eatNextIdentifier
)
543 eatNextIdentifier
= false;
545 restrKeyword
= false;
547 kjsyylloc
.first_line
= yylineno
; // ???
548 kjsyylloc
.last_line
= yylineno
;
555 if(token
== '}' || token
== ';') {
559 case IdentifierOrKeyword
:
560 if ((token
= Lookup::find(&mainTable
, m_buffer16
.data(), m_buffer16
.size())) < 0) {
562 // Lookup for keyword failed, means this is an identifier
563 // Apply anonymous-function hack below (eat the identifier)
564 if (eatNextIdentifier
) {
565 eatNextIdentifier
= false;
569 kjsyylval
.ident
= makeIdentifier(m_buffer16
);
574 eatNextIdentifier
= false;
575 // Hack for "f = function somename() { ... }", too hard to get into the grammar
576 if (token
== FUNCTION
&& lastToken
== '=' )
577 eatNextIdentifier
= true;
579 if (token
== CONTINUE
|| token
== BREAK
||
580 token
== RETURN
|| token
== THROW
)
584 kjsyylval
.string
= makeUString(m_buffer16
);
588 kjsyylval
.doubleValue
= dval
;
593 fprintf(stderr
, "yylex: ERROR.\n");
598 ASSERT(!"unhandled numeration value in switch");
606 bool Lexer::isWhiteSpace() const
608 return current
== '\t' || current
== 0x0b || current
== 0x0c || isSeparatorSpace(current
);
// Reports whether the current character terminates a line: '\r', '\n',
// U+2028 or U+2029.
// NOTE(review): the statements between the cr/lf tests and the return are not
// visible in this excerpt; the function is non-const, so it presumably also
// updates lexer state (e.g. the skipLF/skipCR flags read in lex()) -- confirm.
611 bool Lexer::isLineTerminator()
613 bool cr
= (current
== '\r');
614 bool lf
= (current
== '\n');
619 return cr
|| lf
|| current
== 0x2028 || current
== 0x2029;
622 bool Lexer::isIdentStart(int c
)
624 return (category(c
) & (Letter_Uppercase
| Letter_Lowercase
| Letter_Titlecase
| Letter_Modifier
| Letter_Other
))
625 || c
== '$' || c
== '_';
628 bool Lexer::isIdentPart(int c
)
630 return (category(c
) & (Letter_Uppercase
| Letter_Lowercase
| Letter_Titlecase
| Letter_Modifier
| Letter_Other
631 | Mark_NonSpacing
| Mark_SpacingCombining
| Number_DecimalDigit
| Punctuation_Connector
))
632 || c
== '$' || c
== '_';
// Reports whether c is an ASCII decimal digit ('0' through '9').
static bool isDecimalDigit(int c)
{
  return !(c < '0' || c > '9');
}
640 bool Lexer::isHexDigit(int c
)
642 return (c
>= '0' && c
<= '9' ||
643 c
>= 'a' && c
<= 'f' ||
644 c
>= 'A' && c
<= 'F');
647 bool Lexer::isOctalDigit(int c
)
649 return (c
>= '0' && c
<= '7');
// Tries to recognize a punctuator that begins at c1, using c2..c4 as lookahead.
// Candidates are tested longest-first: the four-character ">>>=" comes before
// the three-character forms, and those before the two-character forms, so a
// longer operator is checked before any operator that is its prefix. The last
// visible statement returns c1 itself converted to int.
// NOTE(review): the bodies of most branches, and how charPos is written, are
// not visible in this excerpt.
652 int Lexer::matchPunctuator(int& charPos
, int c1
, int c2
, int c3
, int c4
)
654 if (c1
== '>' && c2
== '>' && c3
== '>' && c4
== '=') {
657 } else if (c1
== '=' && c2
== '=' && c3
== '=') {
660 } else if (c1
== '!' && c2
== '=' && c3
== '=') {
663 } else if (c1
== '>' && c2
== '>' && c3
== '>') {
666 } else if (c1
== '<' && c2
== '<' && c3
== '=') {
669 } else if (c1
== '>' && c2
== '>' && c3
== '=') {
672 } else if (c1
== '<' && c2
== '=') {
675 } else if (c1
== '>' && c2
== '=') {
678 } else if (c1
== '!' && c2
== '=') {
681 } else if (c1
== '+' && c2
== '+') {
687 } else if (c1
== '-' && c2
== '-') {
690 return AUTOMINUSMINUS
;
693 } else if (c1
== '=' && c2
== '=') {
696 } else if (c1
== '+' && c2
== '=') {
699 } else if (c1
== '-' && c2
== '=') {
702 } else if (c1
== '*' && c2
== '=') {
705 } else if (c1
== '/' && c2
== '=') {
708 } else if (c1
== '&' && c2
== '=') {
711 } else if (c1
== '^' && c2
== '=') {
714 } else if (c1
== '%' && c2
== '=') {
717 } else if (c1
== '|' && c2
== '=') {
720 } else if (c1
== '<' && c2
== '<') {
723 } else if (c1
== '>' && c2
== '>') {
726 } else if (c1
== '&' && c2
== '&') {
729 } else if (c1
== '|' && c2
== '|') {
758 return static_cast<int>(c1
);
772 unsigned short Lexer::singleEscape(unsigned short c
)
798 unsigned short Lexer::convertOctal(int c1
, int c2
, int c3
)
800 return static_cast<unsigned short>((c1
- '0') * 64 + (c2
- '0') * 8 + c3
- '0');
803 unsigned char Lexer::convertHex(int c
)
805 if (c
>= '0' && c
<= '9')
806 return static_cast<unsigned char>(c
- '0');
807 if (c
>= 'a' && c
<= 'f')
808 return static_cast<unsigned char>(c
- 'a' + 10);
809 return static_cast<unsigned char>(c
- 'A' + 10);
812 unsigned char Lexer::convertHex(int c1
, int c2
)
814 return ((convertHex(c1
) << 4) + convertHex(c2
));
817 KJS::UChar
Lexer::convertUnicode(int c1
, int c2
, int c3
, int c4
)
819 return KJS::UChar((convertHex(c1
) << 4) + convertHex(c2
),
820 (convertHex(c3
) << 4) + convertHex(c4
));
// Appends c, narrowed to char, to the 8-bit buffer m_buffer8.
// NOTE(review): the lines between the signature and the append are not visible
// in this excerpt; presumably range assertions on c -- confirm.
823 void Lexer::record8(int c
)
827 m_buffer8
.append(static_cast<char>(c
));
// Appends c to the 16-bit buffer by narrowing it to unsigned short and
// forwarding to the UChar overload. Asserts that c does not exceed USHRT_MAX.
// NOTE(review): one line before the visible assertion is missing from this
// excerpt; presumably a lower-bound assertion -- confirm.
830 void Lexer::record16(int c
)
833 ASSERT(c
<= USHRT_MAX
);
834 record16(UChar(static_cast<unsigned short>(c
)));
837 void Lexer::record16(KJS::UChar c
)
839 m_buffer16
.append(c
);
// Scans a regular-expression literal. Visible logic: a line terminator or end
// of input (current == -1) is tested first; otherwise scanning continues while
// the current character is not '/', or the '/' follows a backslash, or it lies
// inside a '[' ... ']' class. An unescaped '/' outside brackets ends the
// pattern, which is stored in m_pattern from m_buffer16. Identifier-part
// characters that follow are then consumed and m_flags is stored from
// m_buffer16.
// NOTE(review): the statements that record characters, update inBrackets and
// lastWasEscape, reset the buffer, and produce the return value are not
// visible in this excerpt.
842 bool Lexer::scanRegExp()
845 bool lastWasEscape
= false;
846 bool inBrackets
= false;
849 if (isLineTerminator() || current
== -1)
851 else if (current
!= '/' || lastWasEscape
== true || inBrackets
== true)
853 // keep track of '[' and ']'
854 if (!lastWasEscape
) {
855 if ( current
== '[' && !inBrackets
)
857 if ( current
== ']' && inBrackets
)
862 !lastWasEscape
&& (current
== '\\');
863 } else { // end of regexp
864 m_pattern
= UString(m_buffer16
);
872 while (isIdentPart(current
)) {
876 m_flags
= UString(m_buffer16
);
883 deleteAllValues(m_strings
);
884 Vector
<UString
*> newStrings
;
885 newStrings
.reserveCapacity(initialStringTableCapacity
);
886 m_strings
.swap(newStrings
);
888 deleteAllValues(m_identifiers
);
889 Vector
<KJS::Identifier
*> newIdentifiers
;
890 newIdentifiers
.reserveCapacity(initialStringTableCapacity
);
891 m_identifiers
.swap(newIdentifiers
);
893 Vector
<char> newBuffer8
;
894 newBuffer8
.reserveCapacity(initialReadBufferCapacity
);
895 m_buffer8
.swap(newBuffer8
);
897 Vector
<UChar
> newBuffer16
;
898 newBuffer16
.reserveCapacity(initialReadBufferCapacity
);
899 m_buffer16
.swap(newBuffer16
);
// Allocates a KJS::Identifier with `new` from buffer's characters
// (buffer.data(), buffer.size()) and appends the pointer to m_identifiers.
// Elsewhere in this file deleteAllValues(m_identifiers) is called on that
// table, so the lexer keeps ownership of the allocation.
// NOTE(review): the return statement is not visible in this excerpt;
// presumably it returns `identifier` -- confirm.
905 Identifier
* Lexer::makeIdentifier(const Vector
<KJS::UChar
>& buffer
)
907 KJS::Identifier
* identifier
= new KJS::Identifier(buffer
.data(), buffer
.size());
908 m_identifiers
.append(identifier
);
// Allocates a UString with `new` from buffer and appends the pointer to
// m_strings. Elsewhere in this file deleteAllValues(m_strings) is called on
// that table, so the lexer keeps ownership of the allocation.
// NOTE(review): the return statement is not visible in this excerpt;
// presumably it returns `string` -- confirm.
912 UString
* Lexer::makeUString(const Vector
<KJS::UChar
>& buffer
)
914 UString
* string
= new UString(buffer
);
915 m_strings
.append(string
);