X-Git-Url: https://git.saurik.com/apple/javascriptcore.git/blobdiff_plain/9dae56ea45a0f5f8136a5c93d6f3a7f99399ca73..14957cd040308e3eeec43d26bae5d76da13fcd85:/parser/Lexer.cpp?ds=inline diff --git a/parser/Lexer.cpp b/parser/Lexer.cpp index c2880dc..cae6bb9 100644 --- a/parser/Lexer.cpp +++ b/parser/Lexer.cpp @@ -2,6 +2,7 @@ * Copyright (C) 1999-2000 Harri Porten (porten@kde.org) * Copyright (C) 2006, 2007, 2008, 2009 Apple Inc. All Rights Reserved. * Copyright (C) 2007 Cameron Zwarich (cwzwarich@uwaterloo.ca) + * Copyright (C) 2010 Zoltan Herczeg (zherczeg@inf.u-szeged.hu) * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Library General Public @@ -24,877 +25,1234 @@ #include "Lexer.h" #include "JSFunction.h" + #include "JSGlobalObjectFunctions.h" +#include "Identifier.h" #include "NodeInfo.h" #include "Nodes.h" #include "dtoa.h" #include #include #include -#include #include -#include using namespace WTF; using namespace Unicode; -// we can't specify the namespace in yacc's C output, so do it here -using namespace JSC; - -#ifndef KDE_USE_FINAL -#include "Grammar.h" -#endif - +#include "JSParser.h" +#include "KeywordLookup.h" #include "Lookup.h" #include "Lexer.lut.h" -// a bridge for yacc from the C world to C++ -int jscyylex(void* lvalp, void* llocp, void* globalData) -{ - return static_cast(globalData)->lexer->lex(lvalp, llocp); -} - namespace JSC { -static bool isDecimalDigit(int); + +enum CharacterType { + // Types for the main switch + + // The first three types are fixed, and also used for identifying + // ASCII alpha and alphanumeric characters (see isIdentStart and isIdentPart). + CharacterIdentifierStart, + CharacterZero, + CharacterNumber, + + CharacterInvalid, + CharacterLineTerminator, + CharacterExclamationMark, + CharacterOpenParen, + CharacterCloseParen, + CharacterOpenBracket, + CharacterCloseBracket, + CharacterComma, + CharacterColon, + CharacterQuestion, + CharacterTilde, + CharacterQuote, + CharacterDot, + CharacterSlash, + CharacterBackSlash, + CharacterSemicolon, + CharacterOpenBrace, + CharacterCloseBrace, + + CharacterAdd, + CharacterSub, + CharacterMultiply, + CharacterModulo, + CharacterAnd, + CharacterXor, + CharacterOr, + CharacterLess, + CharacterGreater, + CharacterEqual, + + // Other types (only one so far) + CharacterWhiteSpace, +}; + +// 128 ASCII codes +static const unsigned short typesOfASCIICharacters[128] = { +/* 0 - Null */ CharacterInvalid, +/* 1 - Start of Heading */ CharacterInvalid, +/* 2 - Start of Text */ CharacterInvalid, +/* 3 - End of Text */ CharacterInvalid, +/* 4 - End of Transm. */ CharacterInvalid, +/* 5 - Enquiry */ CharacterInvalid, +/* 6 - Acknowledgment */ CharacterInvalid, +/* 7 - Bell */ CharacterInvalid, +/* 8 - Back Space */ CharacterInvalid, +/* 9 - Horizontal Tab */ CharacterWhiteSpace, +/* 10 - Line Feed */ CharacterLineTerminator, +/* 11 - Vertical Tab */ CharacterWhiteSpace, +/* 12 - Form Feed */ CharacterWhiteSpace, +/* 13 - Carriage Return */ CharacterLineTerminator, +/* 14 - Shift Out */ CharacterInvalid, +/* 15 - Shift In */ CharacterInvalid, +/* 16 - Data Line Escape */ CharacterInvalid, +/* 17 - Device Control 1 */ CharacterInvalid, +/* 18 - Device Control 2 */ CharacterInvalid, +/* 19 - Device Control 3 */ CharacterInvalid, +/* 20 - Device Control 4 */ CharacterInvalid, +/* 21 - Negative Ack. */ CharacterInvalid, +/* 22 - Synchronous Idle */ CharacterInvalid, +/* 23 - End of Transmit */ CharacterInvalid, +/* 24 - Cancel */ CharacterInvalid, +/* 25 - End of Medium */ CharacterInvalid, +/* 26 - Substitute */ CharacterInvalid, +/* 27 - Escape */ CharacterInvalid, +/* 28 - File Separator */ CharacterInvalid, +/* 29 - Group Separator */ CharacterInvalid, +/* 30 - Record Separator */ CharacterInvalid, +/* 31 - Unit Separator */ CharacterInvalid, +/* 32 - Space */ CharacterWhiteSpace, +/* 33 - ! */ CharacterExclamationMark, +/* 34 - " */ CharacterQuote, +/* 35 - # */ CharacterInvalid, +/* 36 - $ */ CharacterIdentifierStart, +/* 37 - % */ CharacterModulo, +/* 38 - & */ CharacterAnd, +/* 39 - ' */ CharacterQuote, +/* 40 - ( */ CharacterOpenParen, +/* 41 - ) */ CharacterCloseParen, +/* 42 - * */ CharacterMultiply, +/* 43 - + */ CharacterAdd, +/* 44 - , */ CharacterComma, +/* 45 - - */ CharacterSub, +/* 46 - . */ CharacterDot, +/* 47 - / */ CharacterSlash, +/* 48 - 0 */ CharacterZero, +/* 49 - 1 */ CharacterNumber, +/* 50 - 2 */ CharacterNumber, +/* 51 - 3 */ CharacterNumber, +/* 52 - 4 */ CharacterNumber, +/* 53 - 5 */ CharacterNumber, +/* 54 - 6 */ CharacterNumber, +/* 55 - 7 */ CharacterNumber, +/* 56 - 8 */ CharacterNumber, +/* 57 - 9 */ CharacterNumber, +/* 58 - : */ CharacterColon, +/* 59 - ; */ CharacterSemicolon, +/* 60 - < */ CharacterLess, +/* 61 - = */ CharacterEqual, +/* 62 - > */ CharacterGreater, +/* 63 - ? */ CharacterQuestion, +/* 64 - @ */ CharacterInvalid, +/* 65 - A */ CharacterIdentifierStart, +/* 66 - B */ CharacterIdentifierStart, +/* 67 - C */ CharacterIdentifierStart, +/* 68 - D */ CharacterIdentifierStart, +/* 69 - E */ CharacterIdentifierStart, +/* 70 - F */ CharacterIdentifierStart, +/* 71 - G */ CharacterIdentifierStart, +/* 72 - H */ CharacterIdentifierStart, +/* 73 - I */ CharacterIdentifierStart, +/* 74 - J */ CharacterIdentifierStart, +/* 75 - K */ CharacterIdentifierStart, +/* 76 - L */ CharacterIdentifierStart, +/* 77 - M */ CharacterIdentifierStart, +/* 78 - N */ CharacterIdentifierStart, +/* 79 - O */ CharacterIdentifierStart, +/* 80 - P */ CharacterIdentifierStart, +/* 81 - Q */ CharacterIdentifierStart, +/* 82 - R */ CharacterIdentifierStart, +/* 83 - S */ CharacterIdentifierStart, +/* 84 - T */ CharacterIdentifierStart, +/* 85 - U */ CharacterIdentifierStart, +/* 86 - V */ CharacterIdentifierStart, +/* 87 - W */ CharacterIdentifierStart, +/* 88 - X */ CharacterIdentifierStart, +/* 89 - Y */ CharacterIdentifierStart, +/* 90 - Z */ CharacterIdentifierStart, +/* 91 - [ */ CharacterOpenBracket, +/* 92 - \ */ CharacterBackSlash, +/* 93 - ] */ CharacterCloseBracket, +/* 94 - ^ */ CharacterXor, +/* 95 - _ */ CharacterIdentifierStart, +/* 96 - ` */ CharacterInvalid, +/* 97 - a */ CharacterIdentifierStart, +/* 98 - b */ CharacterIdentifierStart, +/* 99 - c */ CharacterIdentifierStart, +/* 100 - d */ CharacterIdentifierStart, +/* 101 - e */ CharacterIdentifierStart, +/* 102 - f */ CharacterIdentifierStart, +/* 103 - g */ CharacterIdentifierStart, +/* 104 - h */ CharacterIdentifierStart, +/* 105 - i */ CharacterIdentifierStart, +/* 106 - j */ CharacterIdentifierStart, +/* 107 - k */ CharacterIdentifierStart, +/* 108 - l */ CharacterIdentifierStart, +/* 109 - m */ CharacterIdentifierStart, +/* 110 - n */ CharacterIdentifierStart, +/* 111 - o */ CharacterIdentifierStart, +/* 112 - p */ CharacterIdentifierStart, +/* 113 - q */ CharacterIdentifierStart, +/* 114 - r */ CharacterIdentifierStart, +/* 115 - s */ CharacterIdentifierStart, +/* 116 - t */ CharacterIdentifierStart, +/* 117 - u */ CharacterIdentifierStart, +/* 118 - v */ CharacterIdentifierStart, +/* 119 - w */ CharacterIdentifierStart, +/* 120 - x */ CharacterIdentifierStart, +/* 121 - y */ CharacterIdentifierStart, +/* 122 - z */ CharacterIdentifierStart, +/* 123 - { */ CharacterOpenBrace, +/* 124 - | */ CharacterOr, +/* 125 - } */ CharacterCloseBrace, +/* 126 - ~ */ CharacterTilde, +/* 127 - Delete */ CharacterInvalid, +}; Lexer::Lexer(JSGlobalData* globalData) - : yylineno(1) - , m_restrKeyword(false) - , m_eatNextIdentifier(false) - , m_stackToken(-1) - , m_lastToken(-1) - , m_position(0) - , m_code(0) - , m_length(0) - , m_isReparsing(false) - , m_atLineStart(true) - , m_current(0) - , m_next1(0) - , m_next2(0) - , m_next3(0) - , m_currentOffset(0) - , m_nextOffset1(0) - , m_nextOffset2(0) - , m_nextOffset3(0) + : m_isReparsing(false) , m_globalData(globalData) - , m_mainTable(JSC::mainTable) + , m_keywordTable(JSC::mainTable) { - m_buffer8.reserveInitialCapacity(initialReadBufferCapacity); - m_buffer16.reserveInitialCapacity(initialReadBufferCapacity); } Lexer::~Lexer() { - m_mainTable.deleteTable(); + m_keywordTable.deleteTable(); +} + +ALWAYS_INLINE const UChar* Lexer::currentCharacter() const +{ + ASSERT(m_code <= m_codeEnd); + return m_code; } -void Lexer::setCode(const SourceCode& source) +ALWAYS_INLINE int Lexer::currentOffset() const { - yylineno = source.firstLine(); - m_restrKeyword = false; + return currentCharacter() - m_codeStart; +} + +void Lexer::setCode(const SourceCode& source, ParserArena& arena) +{ + m_arena = &arena.identifierArena(); + + m_lineNumber = source.firstLine(); m_delimited = false; - m_eatNextIdentifier = false; - m_stackToken = -1; m_lastToken = -1; - m_position = source.startOffset(); + const UChar* data = source.provider()->data(); + m_source = &source; - m_code = source.provider()->data(); - m_length = source.endOffset(); - m_skipLF = false; - m_skipCR = false; + m_codeStart = data; + m_code = data + source.startOffset(); + m_codeEnd = data + source.endOffset(); m_error = false; m_atLineStart = true; - // read first characters - shift(4); + m_buffer8.reserveInitialCapacity(initialReadBufferCapacity); + m_buffer16.reserveInitialCapacity((m_codeEnd - m_code) / 2); + + if (LIKELY(m_code < m_codeEnd)) + m_current = *m_code; + else + m_current = -1; + ASSERT(currentOffset() == source.startOffset()); } -void Lexer::shift(unsigned p) +template ALWAYS_INLINE void Lexer::internalShift() { - // ECMA-262 calls for stripping Cf characters here, but we only do this for BOM, - // see . - - while (p--) { - m_current = m_next1; - m_next1 = m_next2; - m_next2 = m_next3; - m_currentOffset = m_nextOffset1; - m_nextOffset1 = m_nextOffset2; - m_nextOffset2 = m_nextOffset3; - do { - if (m_position >= m_length) { - m_nextOffset3 = m_position; - m_position++; - m_next3 = -1; - break; - } - m_nextOffset3 = m_position; - m_next3 = m_code[m_position++]; - } while (m_next3 == 0xFEFF); + if (shouldBoundsCheck == DoBoundsCheck) { + // Faster than an if-else sequence + ASSERT(m_current != -1); + m_current = -1; + m_code += shiftAmount; + if (LIKELY(m_code < m_codeEnd)) + m_current = *m_code; + } else { + m_code += shiftAmount; + m_current = *m_code; } } -// called on each new line -void Lexer::nextLine() +ALWAYS_INLINE void Lexer::shift() { - yylineno++; - m_atLineStart = true; + internalShift<1, DoBoundsCheck>(); } -void Lexer::setDone(State s) +ALWAYS_INLINE int Lexer::peek(int offset) { - m_state = s; - m_done = true; + // Only use if necessary + ASSERT(offset > 0 && offset < 5); + const UChar* code = m_code + offset; + return (code < m_codeEnd) ? *code : -1; } -int Lexer::lex(void* p1, void* p2) +int Lexer::getUnicodeCharacter() { - YYSTYPE* lvalp = static_cast(p1); - YYLTYPE* llocp = static_cast(p2); - int token = 0; - m_state = Start; - unsigned short stringType = 0; // either single or double quotes - m_buffer8.clear(); - m_buffer16.clear(); - m_done = false; - m_terminator = false; - m_skipLF = false; - m_skipCR = false; - - // did we push a token on the stack previously ? - // (after an automatic semicolon insertion) - if (m_stackToken >= 0) { - setDone(Other); - token = m_stackToken; - m_stackToken = 0; - } - int startOffset = m_currentOffset; - while (!m_done) { - if (m_skipLF && m_current != '\n') // found \r but not \n afterwards - m_skipLF = false; - if (m_skipCR && m_current != '\r') // found \n but not \r afterwards - m_skipCR = false; - if (m_skipLF || m_skipCR) { // found \r\n or \n\r -> eat the second one - m_skipLF = false; - m_skipCR = false; - shift(1); - } - switch (m_state) { - case Start: - startOffset = m_currentOffset; - if (isWhiteSpace()) { - // do nothing - } else if (m_current == '/' && m_next1 == '/') { - shift(1); - m_state = InSingleLineComment; - } else if (m_current == '/' && m_next1 == '*') { - shift(1); - m_state = InMultiLineComment; - } else if (m_current == -1) { - if (!m_terminator && !m_delimited && !m_isReparsing) { - // automatic semicolon insertion if program incomplete - token = ';'; - m_stackToken = 0; - setDone(Other); - } else - setDone(Eof); - } else if (isLineTerminator()) { - nextLine(); - m_terminator = true; - if (m_restrKeyword) { - token = ';'; - setDone(Other); - } - } else if (m_current == '"' || m_current == '\'') { - m_state = InString; - stringType = static_cast(m_current); - } else if (isIdentStart(m_current)) { - record16(m_current); - m_state = InIdentifierOrKeyword; - } else if (m_current == '\\') - m_state = InIdentifierStartUnicodeEscapeStart; - else if (m_current == '0') { - record8(m_current); - m_state = InNum0; - } else if (isDecimalDigit(m_current)) { - record8(m_current); - m_state = InNum; - } else if (m_current == '.' && isDecimalDigit(m_next1)) { - record8(m_current); - m_state = InDecimal; - // - } else if (m_atLineStart && m_current == '-' && m_next1 == '-' && m_next2 == '>') { - shift(2); - m_state = InSingleLineComment; - } else { - token = matchPunctuator(lvalp->intValue, m_current, m_next1, m_next2, m_next3); - if (token != -1) - setDone(Other); - else - setDone(Bad); - } - break; - case InString: - if (m_current == stringType) { - shift(1); - setDone(String); - } else if (isLineTerminator() || m_current == -1) - setDone(Bad); - else if (m_current == '\\') - m_state = InEscapeSequence; - else - record16(m_current); - break; - // Escape Sequences inside of strings - case InEscapeSequence: - if (isOctalDigit(m_current)) { - if (m_current >= '0' && m_current <= '3' && - isOctalDigit(m_next1) && isOctalDigit(m_next2)) { - record16(convertOctal(m_current, m_next1, m_next2)); - shift(2); - m_state = InString; - } else if (isOctalDigit(m_current) && isOctalDigit(m_next1)) { - record16(convertOctal('0', m_current, m_next1)); - shift(1); - m_state = InString; - } else if (isOctalDigit(m_current)) { - record16(convertOctal('0', '0', m_current)); - m_state = InString; - } else - setDone(Bad); - } else if (m_current == 'x') - m_state = InHexEscape; - else if (m_current == 'u') - m_state = InUnicodeEscape; - else if (isLineTerminator()) { - nextLine(); - m_state = InString; - } else { - record16(singleEscape(static_cast(m_current))); - m_state = InString; - } - break; - case InHexEscape: - if (isHexDigit(m_current) && isHexDigit(m_next1)) { - m_state = InString; - record16(convertHex(m_current, m_next1)); - shift(1); - } else if (m_current == stringType) { - record16('x'); - shift(1); - setDone(String); - } else { - record16('x'); - record16(m_current); - m_state = InString; - } - break; - case InUnicodeEscape: - if (isHexDigit(m_current) && isHexDigit(m_next1) && isHexDigit(m_next2) && isHexDigit(m_next3)) { - record16(convertUnicode(m_current, m_next1, m_next2, m_next3)); - shift(3); - m_state = InString; - } else if (m_current == stringType) { - record16('u'); - shift(1); - setDone(String); - } else - setDone(Bad); - break; - case InSingleLineComment: - if (isLineTerminator()) { - nextLine(); - m_terminator = true; - if (m_restrKeyword) { - token = ';'; - setDone(Other); - } else - m_state = Start; - } else if (m_current == -1) - setDone(Eof); - break; - case InMultiLineComment: - if (m_current == -1) - setDone(Bad); - else if (isLineTerminator()) - nextLine(); - else if (m_current == '*' && m_next1 == '/') { - m_state = Start; - shift(1); - } - break; - case InIdentifierOrKeyword: - case InIdentifier: - if (isIdentPart(m_current)) - record16(m_current); - else if (m_current == '\\') - m_state = InIdentifierPartUnicodeEscapeStart; - else - setDone(m_state == InIdentifierOrKeyword ? IdentifierOrKeyword : Identifier); - break; - case InNum0: - if (m_current == 'x' || m_current == 'X') { - record8(m_current); - m_state = InHex; - } else if (m_current == '.') { - record8(m_current); - m_state = InDecimal; - } else if (m_current == 'e' || m_current == 'E') { - record8(m_current); - m_state = InExponentIndicator; - } else if (isOctalDigit(m_current)) { - record8(m_current); - m_state = InOctal; - } else if (isDecimalDigit(m_current)) { - record8(m_current); - m_state = InDecimal; - } else - setDone(Number); - break; - case InHex: - if (isHexDigit(m_current)) - record8(m_current); - else - setDone(Hex); - break; - case InOctal: - if (isOctalDigit(m_current)) - record8(m_current); - else if (isDecimalDigit(m_current)) { - record8(m_current); - m_state = InDecimal; - } else - setDone(Octal); - break; - case InNum: - if (isDecimalDigit(m_current)) - record8(m_current); - else if (m_current == '.') { - record8(m_current); - m_state = InDecimal; - } else if (m_current == 'e' || m_current == 'E') { - record8(m_current); - m_state = InExponentIndicator; - } else - setDone(Number); - break; - case InDecimal: - if (isDecimalDigit(m_current)) - record8(m_current); - else if (m_current == 'e' || m_current == 'E') { - record8(m_current); - m_state = InExponentIndicator; - } else - setDone(Number); - break; - case InExponentIndicator: - if (m_current == '+' || m_current == '-') - record8(m_current); - else if (isDecimalDigit(m_current)) { - record8(m_current); - m_state = InExponent; - } else - setDone(Bad); - break; - case InExponent: - if (isDecimalDigit(m_current)) - record8(m_current); - else - setDone(Number); - break; - case InIdentifierStartUnicodeEscapeStart: - if (m_current == 'u') - m_state = InIdentifierStartUnicodeEscape; - else - setDone(Bad); - break; - case InIdentifierPartUnicodeEscapeStart: - if (m_current == 'u') - m_state = InIdentifierPartUnicodeEscape; - else - setDone(Bad); - break; - case InIdentifierStartUnicodeEscape: - if (!isHexDigit(m_current) || !isHexDigit(m_next1) || !isHexDigit(m_next2) || !isHexDigit(m_next3)) { - setDone(Bad); - break; - } - token = convertUnicode(m_current, m_next1, m_next2, m_next3); - shift(3); - if (!isIdentStart(token)) { - setDone(Bad); - break; - } - record16(token); - m_state = InIdentifier; - break; - case InIdentifierPartUnicodeEscape: - if (!isHexDigit(m_current) || !isHexDigit(m_next1) || !isHexDigit(m_next2) || !isHexDigit(m_next3)) { - setDone(Bad); - break; - } - token = convertUnicode(m_current, m_next1, m_next2, m_next3); - shift(3); - if (!isIdentPart(token)) { - setDone(Bad); - break; - } - record16(token); - m_state = InIdentifier; - break; - default: - ASSERT(!"Unhandled state in switch statement"); - } + int char1 = peek(1); + int char2 = peek(2); + int char3 = peek(3); - // move on to the next character - if (!m_done) - shift(1); - if (m_state != Start && m_state != InSingleLineComment) - m_atLineStart = false; - } - - // no identifiers allowed directly after numeric literal, e.g. "3in" is bad - if ((m_state == Number || m_state == Octal || m_state == Hex) && isIdentStart(m_current)) - m_state = Bad; - - // terminate string - m_buffer8.append('\0'); + if (UNLIKELY(!isASCIIHexDigit(m_current) || !isASCIIHexDigit(char1) || !isASCIIHexDigit(char2) || !isASCIIHexDigit(char3))) + return -1; -#ifdef JSC_DEBUG_LEX - fprintf(stderr, "line: %d ", lineNo()); - fprintf(stderr, "yytext (%x): ", m_buffer8[0]); - fprintf(stderr, "%s ", m_buffer8.data()); -#endif - - double dval = 0; - if (m_state == Number) - dval = WTF::strtod(m_buffer8.data(), 0L); - else if (m_state == Hex) { // scan hex numbers - const char* p = m_buffer8.data() + 2; - while (char c = *p++) { - dval *= 16; - dval += convertHex(c); - } - - if (dval >= mantissaOverflowLowerBound) - dval = parseIntOverflow(m_buffer8.data() + 2, p - (m_buffer8.data() + 3), 16); + int result = convertUnicode(m_current, char1, char2, char3); + shift(); + shift(); + shift(); + shift(); + return result; +} - m_state = Number; - } else if (m_state == Octal) { // scan octal number - const char* p = m_buffer8.data() + 1; - while (char c = *p++) { - dval *= 8; - dval += c - '0'; - } +void Lexer::shiftLineTerminator() +{ + ASSERT(isLineTerminator(m_current)); - if (dval >= mantissaOverflowLowerBound) - dval = parseIntOverflow(m_buffer8.data() + 1, p - (m_buffer8.data() + 2), 8); + int m_prev = m_current; + shift(); - m_state = Number; - } + // Allow both CRLF and LFCR. + if (m_prev + m_current == '\n' + '\r') + shift(); -#ifdef JSC_DEBUG_LEX - switch (m_state) { - case Eof: - printf("(EOF)\n"); - break; - case Other: - printf("(Other)\n"); - break; - case Identifier: - printf("(Identifier)/(Keyword)\n"); - break; - case String: - printf("(String)\n"); - break; - case Number: - printf("(Number)\n"); - break; - default: - printf("(unknown)"); - } -#endif + ++m_lineNumber; +} - if (m_state != Identifier) - m_eatNextIdentifier = false; +ALWAYS_INLINE bool Lexer::lastTokenWasRestrKeyword() const +{ + return m_lastToken == CONTINUE || m_lastToken == BREAK || m_lastToken == RETURN || m_lastToken == THROW; +} - m_restrKeyword = false; - m_delimited = false; - llocp->first_line = yylineno; - llocp->last_line = yylineno; - llocp->first_column = startOffset; - llocp->last_column = m_currentOffset; - switch (m_state) { - case Eof: - token = 0; - break; - case Other: - if (token == '}' || token == ';') - m_delimited = true; - break; - case Identifier: - // Apply anonymous-function hack below (eat the identifier). - if (m_eatNextIdentifier) { - m_eatNextIdentifier = false; - token = lex(lvalp, llocp); - break; - } - lvalp->ident = makeIdentifier(m_buffer16); - token = IDENT; - break; - case IdentifierOrKeyword: { - lvalp->ident = makeIdentifier(m_buffer16); - const HashEntry* entry = m_mainTable.entry(m_globalData, *lvalp->ident); - if (!entry) { - // Lookup for keyword failed, means this is an identifier. - token = IDENT; - break; - } - token = entry->lexerValue(); - // Hack for "f = function somename() { ... }"; too hard to get into the grammar. - m_eatNextIdentifier = token == FUNCTION && m_lastToken == '='; - if (token == CONTINUE || token == BREAK || token == RETURN || token == THROW) - m_restrKeyword = true; - break; - } - case String: - // Atomize constant strings in case they're later used in property lookup. - lvalp->ident = makeIdentifier(m_buffer16); - token = STRING; - break; - case Number: - lvalp->doubleValue = dval; - token = NUMBER; - break; - case Bad: -#ifdef JSC_DEBUG_LEX - fprintf(stderr, "yylex: ERROR.\n"); -#endif - m_error = true; - return -1; - default: - ASSERT(!"unhandled numeration value in switch"); - m_error = true; - return -1; - } - m_lastToken = token; - return token; +static NEVER_INLINE bool isNonASCIIIdentStart(int c) +{ + return category(c) & (Letter_Uppercase | Letter_Lowercase | Letter_Titlecase | Letter_Modifier | Letter_Other); } -bool Lexer::isWhiteSpace() const +static inline bool isIdentStart(int c) { - return m_current == '\t' || m_current == 0x0b || m_current == 0x0c || isSeparatorSpace(m_current); + return isASCII(c) ? typesOfASCIICharacters[c] == CharacterIdentifierStart : isNonASCIIIdentStart(c); } -bool Lexer::isLineTerminator() +static NEVER_INLINE bool isNonASCIIIdentPart(int c) { - bool cr = (m_current == '\r'); - bool lf = (m_current == '\n'); - if (cr) - m_skipLF = true; - else if (lf) - m_skipCR = true; - return cr || lf || m_current == 0x2028 || m_current == 0x2029; + return category(c) & (Letter_Uppercase | Letter_Lowercase | Letter_Titlecase | Letter_Modifier | Letter_Other + | Mark_NonSpacing | Mark_SpacingCombining | Number_DecimalDigit | Punctuation_Connector); } -bool Lexer::isIdentStart(int c) +static ALWAYS_INLINE bool isIdentPart(int c) { - return isASCIIAlpha(c) || c == '$' || c == '_' || (!isASCII(c) && (category(c) & (Letter_Uppercase | Letter_Lowercase | Letter_Titlecase | Letter_Modifier | Letter_Other))); + // Character types are divided into two groups depending on whether they can be part of an + // identifier or not. Those whose type value is less or equal than CharacterNumber can be + // part of an identifier. (See the CharacterType definition for more details.) + return isASCII(c) ? typesOfASCIICharacters[c] <= CharacterNumber : isNonASCIIIdentPart(c); } -bool Lexer::isIdentPart(int c) +static inline int singleEscape(int c) { - return isASCIIAlphanumeric(c) || c == '$' || c == '_' || (!isASCII(c) && (category(c) & (Letter_Uppercase | Letter_Lowercase | Letter_Titlecase | Letter_Modifier | Letter_Other - | Mark_NonSpacing | Mark_SpacingCombining | Number_DecimalDigit | Punctuation_Connector))); + switch (c) { + case 'b': + return 0x08; + case 't': + return 0x09; + case 'n': + return 0x0A; + case 'v': + return 0x0B; + case 'f': + return 0x0C; + case 'r': + return 0x0D; + case '\\': + return '\\'; + case '\'': + return '\''; + case '"': + return '"'; + default: + return 0; + } } -static bool isDecimalDigit(int c) +inline void Lexer::record8(int c) { - return isASCIIDigit(c); + ASSERT(c >= 0); + ASSERT(c <= 0xFF); + m_buffer8.append(static_cast(c)); } -bool Lexer::isHexDigit(int c) +inline void Lexer::record16(UChar c) { - return isASCIIHexDigit(c); + m_buffer16.append(c); } -bool Lexer::isOctalDigit(int c) +inline void Lexer::record16(int c) { - return isASCIIOctalDigit(c); + ASSERT(c >= 0); + ASSERT(c <= USHRT_MAX); + record16(UChar(static_cast(c))); } -int Lexer::matchPunctuator(int& charPos, int c1, int c2, int c3, int c4) +template ALWAYS_INLINE JSTokenType Lexer::parseIdentifier(JSTokenData* tokenData, unsigned lexType) { - if (c1 == '>' && c2 == '>' && c3 == '>' && c4 == '=') { - shift(4); - return URSHIFTEQUAL; - } - if (c1 == '=' && c2 == '=' && c3 == '=') { - shift(3); - return STREQ; - } - if (c1 == '!' && c2 == '=' && c3 == '=') { - shift(3); - return STRNEQ; - } - if (c1 == '>' && c2 == '>' && c3 == '>') { - shift(3); - return URSHIFT; - } - if (c1 == '<' && c2 == '<' && c3 == '=') { - shift(3); - return LSHIFTEQUAL; - } - if (c1 == '>' && c2 == '>' && c3 == '=') { - shift(3); - return RSHIFTEQUAL; - } - if (c1 == '<' && c2 == '=') { - shift(2); - return LE; - } - if (c1 == '>' && c2 == '=') { - shift(2); - return GE; - } - if (c1 == '!' && c2 == '=') { - shift(2); - return NE; - } - if (c1 == '+' && c2 == '+') { - shift(2); - if (m_terminator) - return AUTOPLUSPLUS; - return PLUSPLUS; - } - if (c1 == '-' && c2 == '-') { - shift(2); - if (m_terminator) - return AUTOMINUSMINUS; - return MINUSMINUS; - } - if (c1 == '=' && c2 == '=') { - shift(2); - return EQEQ; - } - if (c1 == '+' && c2 == '=') { - shift(2); - return PLUSEQUAL; - } - if (c1 == '-' && c2 == '=') { - shift(2); - return MINUSEQUAL; - } - if (c1 == '*' && c2 == '=') { - shift(2); - return MULTEQUAL; - } - if (c1 == '/' && c2 == '=') { - shift(2); - return DIVEQUAL; - } - if (c1 == '&' && c2 == '=') { - shift(2); - return ANDEQUAL; - } - if (c1 == '^' && c2 == '=') { - shift(2); - return XOREQUAL; - } - if (c1 == '%' && c2 == '=') { - shift(2); - return MODEQUAL; + const ptrdiff_t remaining = m_codeEnd - m_code; + if ((remaining >= maxTokenLength) && !(lexType & IgnoreReservedWords)) { + JSTokenType keyword = parseKeyword(tokenData); + if (keyword != IDENT) { + ASSERT((!shouldCreateIdentifier) || tokenData->ident); + return keyword; + } } - if (c1 == '|' && c2 == '=') { - shift(2); - return OREQUAL; + const UChar* identifierStart = currentCharacter(); + bool bufferRequired = false; + + while (true) { + if (LIKELY(isIdentPart(m_current))) { + shift(); + continue; + } + if (LIKELY(m_current != '\\')) + break; + + // \uXXXX unicode characters. + bufferRequired = true; + if (identifierStart != currentCharacter()) + m_buffer16.append(identifierStart, currentCharacter() - identifierStart); + shift(); + if (UNLIKELY(m_current != 'u')) + return ERRORTOK; + shift(); + int character = getUnicodeCharacter(); + if (UNLIKELY(character == -1)) + return ERRORTOK; + if (UNLIKELY(m_buffer16.size() ? !isIdentPart(character) : !isIdentStart(character))) + return ERRORTOK; + if (shouldCreateIdentifier) + record16(character); + identifierStart = currentCharacter(); } - if (c1 == '<' && c2 == '<') { - shift(2); - return LSHIFT; + + int identifierLength; + const Identifier* ident = 0; + if (shouldCreateIdentifier) { + if (!bufferRequired) + identifierLength = currentCharacter() - identifierStart; + else { + if (identifierStart != currentCharacter()) + m_buffer16.append(identifierStart, currentCharacter() - identifierStart); + identifierStart = m_buffer16.data(); + identifierLength = m_buffer16.size(); + } + + ident = makeIdentifier(identifierStart, identifierLength); + tokenData->ident = ident; + } else + tokenData->ident = 0; + + m_delimited = false; + + if (LIKELY(!bufferRequired && !(lexType & IgnoreReservedWords))) { + ASSERT(shouldCreateIdentifier); + // Keywords must not be recognized if there was an \uXXXX in the identifier. + if (remaining < maxTokenLength) { + const HashEntry* entry = m_keywordTable.entry(m_globalData, *ident); + ASSERT((remaining < maxTokenLength) || !entry); + return entry ? static_cast(entry->lexerValue()) : IDENT; + } + return IDENT; } - if (c1 == '>' && c2 == '>') { - shift(2); - return RSHIFT; + + m_buffer16.resize(0); + return IDENT; +} + +bool Lexer::isKeyword(const Identifier& ident) +{ + return m_keywordTable.entry(m_globalData, ident); +} + +template ALWAYS_INLINE bool Lexer::parseString(JSTokenData* tokenData, bool strictMode) +{ + int stringQuoteCharacter = m_current; + shift(); + + const UChar* stringStart = currentCharacter(); + + while (m_current != stringQuoteCharacter) { + if (UNLIKELY(m_current == '\\')) { + if (stringStart != currentCharacter() && shouldBuildStrings) + m_buffer16.append(stringStart, currentCharacter() - stringStart); + shift(); + + int escape = singleEscape(m_current); + + // Most common escape sequences first + if (escape) { + if (shouldBuildStrings) + record16(escape); + shift(); + } else if (UNLIKELY(isLineTerminator(m_current))) + shiftLineTerminator(); + else if (m_current == 'x') { + shift(); + if (isASCIIHexDigit(m_current) && isASCIIHexDigit(peek(1))) { + int prev = m_current; + shift(); + if (shouldBuildStrings) + record16(convertHex(prev, m_current)); + shift(); + } else if (shouldBuildStrings) + record16('x'); + } else if (m_current == 'u') { + shift(); + int character = getUnicodeCharacter(); + if (character != -1) { + if (shouldBuildStrings) + record16(character); + } else if (m_current == stringQuoteCharacter) { + if (shouldBuildStrings) + record16('u'); + } else // Only stringQuoteCharacter allowed after \u + return false; + } else if (strictMode && isASCIIDigit(m_current)) { + // The only valid numeric escape in strict mode is '\0', and this must not be followed by a decimal digit. + int character1 = m_current; + shift(); + if (character1 != '0' || isASCIIDigit(m_current)) + return false; + if (shouldBuildStrings) + record16(0); + } else if (!strictMode && isASCIIOctalDigit(m_current)) { + // Octal character sequences + int character1 = m_current; + shift(); + if (isASCIIOctalDigit(m_current)) { + // Two octal characters + int character2 = m_current; + shift(); + if (character1 >= '0' && character1 <= '3' && isASCIIOctalDigit(m_current)) { + if (shouldBuildStrings) + record16((character1 - '0') * 64 + (character2 - '0') * 8 + m_current - '0'); + shift(); + } else { + if (shouldBuildStrings) + record16((character1 - '0') * 8 + character2 - '0'); + } + } else { + if (shouldBuildStrings) + record16(character1 - '0'); + } + } else if (m_current != -1) { + if (shouldBuildStrings) + record16(m_current); + shift(); + } else + return false; + + stringStart = currentCharacter(); + continue; + } + // Fast check for characters that require special handling. + // Catches -1, \n, \r, 0x2028, and 0x2029 as efficiently + // as possible, and lets through all common ASCII characters. + if (UNLIKELY(((static_cast(m_current) - 0xE) & 0x2000))) { + // New-line or end of input is not allowed + if (UNLIKELY(isLineTerminator(m_current)) || UNLIKELY(m_current == -1)) + return false; + // Anything else is just a normal character + } + shift(); } - if (c1 == '&' && c2 == '&') { - shift(2); - return AND; + + if (currentCharacter() != stringStart && shouldBuildStrings) + m_buffer16.append(stringStart, currentCharacter() - stringStart); + if (shouldBuildStrings) + tokenData->ident = makeIdentifier(m_buffer16.data(), m_buffer16.size()); + else + tokenData->ident = 0; + + m_buffer16.resize(0); + return true; +} + +ALWAYS_INLINE void Lexer::parseHex(double& returnValue) +{ + // Optimization: most hexadecimal values fit into 4 bytes. + uint32_t hexValue = 0; + int maximumDigits = 7; + + // Shift out the 'x' prefix. + shift(); + + do { + hexValue = (hexValue << 4) + toASCIIHexValue(m_current); + shift(); + --maximumDigits; + } while (isASCIIHexDigit(m_current) && maximumDigits >= 0); + + if (maximumDigits >= 0) { + returnValue = hexValue; + return; } - if (c1 == '|' && c2 == '|') { - shift(2); - return OR; + + // No more place in the hexValue buffer. + // The values are shifted out and placed into the m_buffer8 vector. + for (int i = 0; i < 8; ++i) { + int digit = hexValue >> 28; + if (digit < 10) + record8(digit + '0'); + else + record8(digit - 10 + 'a'); + hexValue <<= 4; } - switch (c1) { - case '=': - case '>': - case '<': - case ',': - case '!': - case '~': - case '?': - case ':': - case '.': - case '+': - case '-': - case '*': - case '/': - case '&': - case '|': - case '^': - case '%': - case '(': - case ')': - case '[': - case ']': - case ';': - shift(1); - return static_cast(c1); - case '{': - charPos = m_currentOffset; - shift(1); - return OPENBRACE; - case '}': - charPos = m_currentOffset; - shift(1); - return CLOSEBRACE; - default: - return -1; + while (isASCIIHexDigit(m_current)) { + record8(m_current); + shift(); } + + returnValue = parseIntOverflow(m_buffer8.data(), m_buffer8.size(), 16); } -unsigned short Lexer::singleEscape(unsigned short c) +ALWAYS_INLINE bool Lexer::parseOctal(double& returnValue) { - switch (c) { - case 'b': - return 0x08; - case 't': - return 0x09; - case 'n': - return 0x0A; - case 'v': - return 0x0B; - case 'f': - return 0x0C; - case 'r': - return 0x0D; - case '"': - return 0x22; - case '\'': - return 0x27; - case '\\': - return 0x5C; - default: - return c; + // Optimization: most octal values fit into 4 bytes. + uint32_t octalValue = 0; + int maximumDigits = 9; + // Temporary buffer for the digits. Makes easier + // to reconstruct the input characters when needed. + char digits[10]; + + do { + octalValue = octalValue * 8 + (m_current - '0'); + digits[maximumDigits] = m_current; + shift(); + --maximumDigits; + } while (isASCIIOctalDigit(m_current) && maximumDigits >= 0); + + if (!isASCIIDigit(m_current) && maximumDigits >= 0) { + returnValue = octalValue; + return true; } -} -unsigned short Lexer::convertOctal(int c1, int c2, int c3) -{ - return static_cast((c1 - '0') * 64 + (c2 - '0') * 8 + c3 - '0'); + for (int i = 9; i > maximumDigits; --i) + record8(digits[i]); + + while (isASCIIOctalDigit(m_current)) { + record8(m_current); + shift(); + } + + if (isASCIIDigit(m_current)) + return false; + + returnValue = parseIntOverflow(m_buffer8.data(), m_buffer8.size(), 8); + return true; } -unsigned char Lexer::convertHex(int c) +ALWAYS_INLINE bool Lexer::parseDecimal(double& returnValue) { - if (c >= '0' && c <= '9') - return static_cast(c - '0'); - if (c >= 'a' && c <= 'f') - return static_cast(c - 'a' + 10); - return static_cast(c - 'A' + 10); + // Optimization: most decimal values fit into 4 bytes. + uint32_t decimalValue = 0; + + // Since parseOctal may be executed before parseDecimal, + // the m_buffer8 may hold ascii digits. + if (!m_buffer8.size()) { + int maximumDigits = 9; + // Temporary buffer for the digits. Makes easier + // to reconstruct the input characters when needed. + char digits[10]; + + do { + decimalValue = decimalValue * 10 + (m_current - '0'); + digits[maximumDigits] = m_current; + shift(); + --maximumDigits; + } while (isASCIIDigit(m_current) && maximumDigits >= 0); + + if (maximumDigits >= 0 && m_current != '.' && (m_current | 0x20) != 'e') { + returnValue = decimalValue; + return true; + } + + for (int i = 9; i > maximumDigits; --i) + record8(digits[i]); + } + + while (isASCIIDigit(m_current)) { + record8(m_current); + shift(); + } + + return false; } -unsigned char Lexer::convertHex(int c1, int c2) +ALWAYS_INLINE void Lexer::parseNumberAfterDecimalPoint() { - return ((convertHex(c1) << 4) + convertHex(c2)); + record8('.'); + while (isASCIIDigit(m_current)) { + record8(m_current); + shift(); + } } -UChar Lexer::convertUnicode(int c1, int c2, int c3, int c4) +ALWAYS_INLINE bool Lexer::parseNumberAfterExponentIndicator() { - unsigned char highByte = (convertHex(c1) << 4) + convertHex(c2); - unsigned char lowByte = (convertHex(c3) << 4) + convertHex(c4); - return (highByte << 8 | lowByte); + record8('e'); + shift(); + if (m_current == '+' || m_current == '-') { + record8(m_current); + shift(); + } + + if (!isASCIIDigit(m_current)) + return false; + + do { + record8(m_current); + shift(); + } while (isASCIIDigit(m_current)); + return true; } -void Lexer::record8(int c) +ALWAYS_INLINE bool Lexer::parseMultilineComment() { - ASSERT(c >= 0); - ASSERT(c <= 0xff); - m_buffer8.append(static_cast(c)); + while (true) { + while (UNLIKELY(m_current == '*')) { + shift(); + if (m_current == '/') { + shift(); + return true; + } + } + + if (UNLIKELY(m_current == -1)) + return false; + + if (isLineTerminator(m_current)) + shiftLineTerminator(); + else + shift(); + } } -void Lexer::record16(int c) +bool Lexer::nextTokenIsColon() { - ASSERT(c >= 0); - ASSERT(c <= USHRT_MAX); - record16(UChar(static_cast(c))); + const UChar* code = m_code; + while (code < m_codeEnd && (isWhiteSpace(*code) || isLineTerminator(*code))) + code++; + + return code < m_codeEnd && *code == ':'; } -void Lexer::record16(UChar c) +JSTokenType Lexer::lex(JSTokenData* tokenData, JSTokenInfo* tokenInfo, unsigned lexType, bool strictMode) { - m_buffer16.append(c); + ASSERT(!m_error); + ASSERT(m_buffer8.isEmpty()); + ASSERT(m_buffer16.isEmpty()); + + JSTokenType token = ERRORTOK; + m_terminator = false; + +start: + while (isWhiteSpace(m_current)) + shift(); + + int startOffset = currentOffset(); + + if (UNLIKELY(m_current == -1)) + return EOFTOK; + + m_delimited = false; + + CharacterType type; + if (LIKELY(isASCII(m_current))) + type = static_cast(typesOfASCIICharacters[m_current]); + else if (isNonASCIIIdentStart(m_current)) + type = CharacterIdentifierStart; + else if (isLineTerminator(m_current)) + type = CharacterLineTerminator; + else + type = CharacterInvalid; + + switch (type) { + case CharacterGreater: + shift(); + if (m_current == '>') { + shift(); + if (m_current == '>') { + shift(); + if (m_current == '=') { + shift(); + token = URSHIFTEQUAL; + break; + } + token = URSHIFT; + break; + } + if (m_current == '=') { + shift(); + token = RSHIFTEQUAL; + break; + } + token = RSHIFT; + break; + } + if (m_current == '=') { + shift(); + token = GE; + break; + } + token = GT; + break; + case CharacterEqual: + shift(); + if (m_current == '=') { + shift(); + if (m_current == '=') { + shift(); + token = STREQ; + break; + } + token = EQEQ; + break; + } + token = EQUAL; + break; + case CharacterLess: + shift(); + if (m_current == '!' && peek(1) == '-' && peek(2) == '-') { + //