X-Git-Url: https://git.saurik.com/apple/javascriptcore.git/blobdiff_plain/6fe7ccc865dc7d7541b93c5bcaf6368d2c98a174..HEAD:/parser/Lexer.cpp diff --git a/parser/Lexer.cpp b/parser/Lexer.cpp index 3b020f4..53aa9b0 100644 --- a/parser/Lexer.cpp +++ b/parser/Lexer.cpp @@ -1,6 +1,6 @@ /* * Copyright (C) 1999-2000 Harri Porten (porten@kde.org) - * Copyright (C) 2006, 2007, 2008, 2009, 2011, 2012 Apple Inc. All Rights Reserved. + * Copyright (C) 2006, 2007, 2008, 2009, 2011, 2012, 2013 Apple Inc. All Rights Reserved. * Copyright (C) 2007 Cameron Zwarich (cwzwarich@uwaterloo.ca) * Copyright (C) 2010 Zoltan Herczeg (zherczeg@inf.u-szeged.hu) * Copyright (C) 2012 Mathias Bynens (mathias@qiwi.be) @@ -25,29 +25,27 @@ #include "config.h" #include "Lexer.h" -#include "JSFunction.h" +#include "JSFunctionInlines.h" +#include "BuiltinNames.h" #include "JSGlobalObjectFunctions.h" #include "Identifier.h" -#include "NodeInfo.h" #include "Nodes.h" +#include "JSCInlines.h" #include #include #include #include #include -using namespace WTF; -using namespace Unicode; - #include "KeywordLookup.h" #include "Lexer.lut.h" #include "Parser.h" namespace JSC { -Keywords::Keywords(JSGlobalData* globalData) - : m_globalData(globalData) +Keywords::Keywords(VM& vm) + : m_vm(vm) , m_keywordTable(JSC::mainTable) { } @@ -73,6 +71,7 @@ enum CharacterType { CharacterQuestion, CharacterTilde, CharacterQuote, + CharacterBackQuote, CharacterDot, CharacterSlash, CharacterBackSlash, @@ -93,6 +92,7 @@ enum CharacterType { // Other types (only one so far) CharacterWhiteSpace, + CharacterPrivateIdentifierStart }; // 256 Latin-1 codes @@ -161,7 +161,7 @@ static const unsigned short typesOfLatin1Characters[256] = { /* 61 - = */ CharacterEqual, /* 62 - > */ CharacterGreater, /* 63 - ? */ CharacterQuestion, -/* 64 - @ */ CharacterInvalid, +/* 64 - @ */ CharacterPrivateIdentifierStart, /* 65 - A */ CharacterIdentifierStart, /* 66 - B */ CharacterIdentifierStart, /* 67 - C */ CharacterIdentifierStart, @@ -193,7 +193,11 @@ static const unsigned short typesOfLatin1Characters[256] = { /* 93 - ] */ CharacterCloseBracket, /* 94 - ^ */ CharacterXor, /* 95 - _ */ CharacterIdentifierStart, +#if ENABLE(ES6_TEMPLATE_LITERAL_SYNTAX) +/* 96 - ` */ CharacterBackQuote, +#else /* 96 - ` */ CharacterInvalid, +#endif /* 97 - a */ CharacterIdentifierStart, /* 98 - b */ CharacterIdentifierStart, /* 99 - c */ CharacterIdentifierStart, @@ -355,11 +359,152 @@ static const unsigned short typesOfLatin1Characters[256] = { /* 255 - Ll category */ CharacterIdentifierStart }; +// This table provides the character that results from \X where X is the index in the table beginning +// with SPACE. A table value of 0 means that more processing needs to be done. +static const LChar singleCharacterEscapeValuesForASCII[128] = { +/* 0 - Null */ 0, +/* 1 - Start of Heading */ 0, +/* 2 - Start of Text */ 0, +/* 3 - End of Text */ 0, +/* 4 - End of Transm. */ 0, +/* 5 - Enquiry */ 0, +/* 6 - Acknowledgment */ 0, +/* 7 - Bell */ 0, +/* 8 - Back Space */ 0, +/* 9 - Horizontal Tab */ 0, +/* 10 - Line Feed */ 0, +/* 11 - Vertical Tab */ 0, +/* 12 - Form Feed */ 0, +/* 13 - Carriage Return */ 0, +/* 14 - Shift Out */ 0, +/* 15 - Shift In */ 0, +/* 16 - Data Line Escape */ 0, +/* 17 - Device Control 1 */ 0, +/* 18 - Device Control 2 */ 0, +/* 19 - Device Control 3 */ 0, +/* 20 - Device Control 4 */ 0, +/* 21 - Negative Ack. */ 0, +/* 22 - Synchronous Idle */ 0, +/* 23 - End of Transmit */ 0, +/* 24 - Cancel */ 0, +/* 25 - End of Medium */ 0, +/* 26 - Substitute */ 0, +/* 27 - Escape */ 0, +/* 28 - File Separator */ 0, +/* 29 - Group Separator */ 0, +/* 30 - Record Separator */ 0, +/* 31 - Unit Separator */ 0, +/* 32 - Space */ ' ', +/* 33 - ! */ '!', +/* 34 - " */ '"', +/* 35 - # */ '#', +/* 36 - $ */ '$', +/* 37 - % */ '%', +/* 38 - & */ '&', +/* 39 - ' */ '\'', +/* 40 - ( */ '(', +/* 41 - ) */ ')', +/* 42 - * */ '*', +/* 43 - + */ '+', +/* 44 - , */ ',', +/* 45 - - */ '-', +/* 46 - . */ '.', +/* 47 - / */ '/', +/* 48 - 0 */ 0, +/* 49 - 1 */ 0, +/* 50 - 2 */ 0, +/* 51 - 3 */ 0, +/* 52 - 4 */ 0, +/* 53 - 5 */ 0, +/* 54 - 6 */ 0, +/* 55 - 7 */ 0, +/* 56 - 8 */ 0, +/* 57 - 9 */ 0, +/* 58 - : */ ':', +/* 59 - ; */ ';', +/* 60 - < */ '<', +/* 61 - = */ '=', +/* 62 - > */ '>', +/* 63 - ? */ '?', +/* 64 - @ */ '@', +/* 65 - A */ 'A', +/* 66 - B */ 'B', +/* 67 - C */ 'C', +/* 68 - D */ 'D', +/* 69 - E */ 'E', +/* 70 - F */ 'F', +/* 71 - G */ 'G', +/* 72 - H */ 'H', +/* 73 - I */ 'I', +/* 74 - J */ 'J', +/* 75 - K */ 'K', +/* 76 - L */ 'L', +/* 77 - M */ 'M', +/* 78 - N */ 'N', +/* 79 - O */ 'O', +/* 80 - P */ 'P', +/* 81 - Q */ 'Q', +/* 82 - R */ 'R', +/* 83 - S */ 'S', +/* 84 - T */ 'T', +/* 85 - U */ 'U', +/* 86 - V */ 'V', +/* 87 - W */ 'W', +/* 88 - X */ 'X', +/* 89 - Y */ 'Y', +/* 90 - Z */ 'Z', +/* 91 - [ */ '[', +/* 92 - \ */ '\\', +/* 93 - ] */ ']', +/* 94 - ^ */ '^', +/* 95 - _ */ '_', +/* 96 - ` */ '`', +/* 97 - a */ 'a', +/* 98 - b */ 0x08, +/* 99 - c */ 'c', +/* 100 - d */ 'd', +/* 101 - e */ 'e', +/* 102 - f */ 0x0C, +/* 103 - g */ 'g', +/* 104 - h */ 'h', +/* 105 - i */ 'i', +/* 106 - j */ 'j', +/* 107 - k */ 'k', +/* 108 - l */ 'l', +/* 109 - m */ 'm', +/* 110 - n */ 0x0A, +/* 111 - o */ 'o', +/* 112 - p */ 'p', +/* 113 - q */ 'q', +/* 114 - r */ 0x0D, +/* 115 - s */ 's', +/* 116 - t */ 0x09, +/* 117 - u */ 0, +/* 118 - v */ 0x0B, +/* 119 - w */ 'w', +/* 120 - x */ 0, +/* 121 - y */ 'y', +/* 122 - z */ 'z', +/* 123 - { */ '{', +/* 124 - | */ '|', +/* 125 - } */ '}', +/* 126 - ~ */ '~', +/* 127 - Delete */ 0 +}; + template -Lexer::Lexer(JSGlobalData* globalData) +Lexer::Lexer(VM* vm, JSParserBuiltinMode builtinMode) : m_isReparsing(false) - , m_globalData(globalData) + , m_vm(vm) + , m_parsingBuiltinFunction(builtinMode == JSParserBuiltinMode::Builtin) +{ +} + +static inline JSTokenType tokenTypeForIntegerLikeToken(double doubleValue) { + if ((doubleValue || !std::signbit(doubleValue)) && static_cast(doubleValue) == doubleValue) + return INTEGER; + return DOUBLE; } template @@ -368,30 +513,30 @@ Lexer::~Lexer() } template -UString Lexer::invalidCharacterMessage() const +String Lexer::invalidCharacterMessage() const { switch (m_current) { case 0: - return "Invalid character: '\\0'"; + return ASCIILiteral("Invalid character: '\\0'"); case 10: - return "Invalid character: '\\n'"; + return ASCIILiteral("Invalid character: '\\n'"); case 11: - return "Invalid character: '\\v'"; + return ASCIILiteral("Invalid character: '\\v'"); case 13: - return "Invalid character: '\\r'"; + return ASCIILiteral("Invalid character: '\\r'"); case 35: - return "Invalid character: '#'"; + return ASCIILiteral("Invalid character: '#'"); case 64: - return "Invalid character: '@'"; + return ASCIILiteral("Invalid character: '@'"); case 96: - return "Invalid character: '`'"; + return ASCIILiteral("Invalid character: '`'"); default: - return String::format("Invalid character '\\u%04u'", static_cast(m_current)).impl(); + return String::format("Invalid character '\\u%04u'", static_cast(m_current)); } } template -ALWAYS_INLINE const T* Lexer::currentCharacter() const +ALWAYS_INLINE const T* Lexer::currentSourcePtr() const { ASSERT(m_code <= m_codeEnd); return m_code; @@ -405,22 +550,26 @@ void Lexer::setCode(const SourceCode& source, ParserArena* arena) m_lineNumber = source.firstLine(); m_lastToken = -1; - const StringImpl* sourceString = source.provider()->data(); + const String& sourceString = source.provider()->source(); - if (sourceString) - setCodeStart(sourceString); + if (!sourceString.isNull()) + setCodeStart(sourceString.impl()); else m_codeStart = 0; m_source = &source; - m_code = m_codeStart + source.startOffset(); + m_sourceOffset = source.startOffset(); + m_codeStartPlusOffset = m_codeStart + source.startOffset(); + m_code = m_codeStartPlusOffset; m_codeEnd = m_codeStart + source.endOffset(); m_error = false; m_atLineStart = true; - m_lexErrorMessage = UString(); + m_lineStart = m_code; + m_lexErrorMessage = String(); m_buffer8.reserveInitialCapacity(initialReadBufferCapacity); m_buffer16.reserveInitialCapacity((m_codeEnd - m_code) / 2); + m_bufferForRawTemplateString16.reserveInitialCapacity(initialReadBufferCapacity); if (LIKELY(m_code < m_codeEnd)) m_current = *m_code; @@ -433,6 +582,7 @@ template template ALWAYS_INLINE void Lexer::internalShift() { m_code += shiftAmount; + ASSERT(currentOffset() >= currentLineStartOffset()); m_current = *m_code; } @@ -461,17 +611,55 @@ ALWAYS_INLINE T Lexer::peek(int offset) const return (code < m_codeEnd) ? *code : 0; } -template -int Lexer::parseFourDigitUnicodeHex() -{ - T char1 = peek(1); - T char2 = peek(2); - T char3 = peek(3); +struct ParsedUnicodeEscapeValue { + ParsedUnicodeEscapeValue(UChar32 value) + : m_value(value) + { + ASSERT(isValid()); + } + + enum SpecialValueType { Incomplete = -2, Invalid = -1 }; + ParsedUnicodeEscapeValue(SpecialValueType type) + : m_value(type) + { + } - if (UNLIKELY(!isASCIIHexDigit(m_current) || !isASCIIHexDigit(char1) || !isASCIIHexDigit(char2) || !isASCIIHexDigit(char3))) - return -1; + bool isValid() const { return m_value >= 0; } + bool isIncomplete() const { return m_value == Incomplete; } + + UChar32 value() const + { + ASSERT(isValid()); + return m_value; + } + +private: + UChar32 m_value; +}; + +template ParsedUnicodeEscapeValue Lexer::parseUnicodeEscape() +{ + if (m_current == '{') { + shift(); + UChar32 codePoint = 0; + do { + if (!isASCIIHexDigit(m_current)) + return m_current ? ParsedUnicodeEscapeValue::Invalid : ParsedUnicodeEscapeValue::Incomplete; + codePoint = (codePoint << 4) | toASCIIHexValue(m_current); + if (codePoint > UCHAR_MAX_VALUE) + return ParsedUnicodeEscapeValue::Invalid; + shift(); + } while (m_current != '}'); + shift(); + return codePoint; + } - int result = convertUnicode(m_current, char1, char2, char3); + auto character2 = peek(1); + auto character3 = peek(2); + auto character4 = peek(3); + if (UNLIKELY(!isASCIIHexDigit(m_current) || !isASCIIHexDigit(character2) || !isASCIIHexDigit(character3) || !isASCIIHexDigit(character4))) + return (m_code + 4) >= m_codeEnd ? ParsedUnicodeEscapeValue::Incomplete : ParsedUnicodeEscapeValue::Invalid; + auto result = convertUnicode(m_current, character2, character3, character4); shift(); shift(); shift(); @@ -484,6 +672,7 @@ void Lexer::shiftLineTerminator() { ASSERT(isLineTerminator(m_current)); + m_positionBeforeLastNewline = currentPosition(); T prev = m_current; shift(); @@ -500,9 +689,9 @@ ALWAYS_INLINE bool Lexer::lastTokenWasRestrKeyword() const return m_lastToken == CONTINUE || m_lastToken == BREAK || m_lastToken == RETURN || m_lastToken == THROW; } -static NEVER_INLINE bool isNonLatin1IdentStart(int c) +static NEVER_INLINE bool isNonLatin1IdentStart(UChar c) { - return category(c) & (Letter_Uppercase | Letter_Lowercase | Letter_Titlecase | Letter_Modifier | Letter_Other); + return U_GET_GC_MASK(c) & U_GC_L_MASK; } static ALWAYS_INLINE bool isLatin1(LChar) @@ -515,20 +704,25 @@ static ALWAYS_INLINE bool isLatin1(UChar c) return c < 256; } +static ALWAYS_INLINE bool isLatin1(UChar32 c) +{ + return !(c & ~0xFF); +} + static inline bool isIdentStart(LChar c) { return typesOfLatin1Characters[c] == CharacterIdentifierStart; } -static inline bool isIdentStart(UChar c) +static inline bool isIdentStart(UChar32 c) { return isLatin1(c) ? isIdentStart(static_cast(c)) : isNonLatin1IdentStart(c); } -static NEVER_INLINE bool isNonLatin1IdentPart(int c) +static NEVER_INLINE bool isNonLatin1IdentPart(UChar32 c) { - return (category(c) & (Letter_Uppercase | Letter_Lowercase | Letter_Titlecase | Letter_Modifier | Letter_Other - | Mark_NonSpacing | Mark_SpacingCombining | Number_DecimalDigit | Punctuation_Connector)) || c == 0x200C || c == 0x200D; + // FIXME: ES6 says this should be based on the Unicode property ID_Continue now instead. + return (U_GET_GC_MASK(c) & (U_GC_L_MASK | U_GC_MN_MASK | U_GC_MC_MASK | U_GC_ND_MASK | U_GC_PC_MASK)) || c == 0x200C || c == 0x200D; } static ALWAYS_INLINE bool isIdentPart(LChar c) @@ -539,35 +733,68 @@ static ALWAYS_INLINE bool isIdentPart(LChar c) return typesOfLatin1Characters[c] <= CharacterNumber; } -static ALWAYS_INLINE bool isIdentPart(UChar c) +static ALWAYS_INLINE bool isIdentPart(UChar32 c) { return isLatin1(c) ? isIdentPart(static_cast(c)) : isNonLatin1IdentPart(c); } -static inline int singleEscape(int c) +static ALWAYS_INLINE bool isIdentPart(UChar c) { - switch (c) { - case 'b': - return 0x08; - case 't': - return 0x09; - case 'n': - return 0x0A; - case 'v': - return 0x0B; - case 'f': - return 0x0C; - case 'r': - return 0x0D; - case '\\': - return '\\'; - case '\'': - return '\''; - case '"': - return '"'; - default: - return 0; + return isIdentPart(static_cast(c)); +} + +template ALWAYS_INLINE bool isIdentPartIncludingEscapeTemplate(const CharacterType* code, const CharacterType* codeEnd) +{ + if (isIdentPart(code[0])) + return true; + + // Shortest sequence handled below is \u{0}, which is 5 characters. + if (!(code[0] == '\\' && codeEnd - code >= 5 && code[1] == 'u')) + return false; + + if (code[2] == '{') { + UChar32 codePoint = 0; + const CharacterType* pointer; + for (pointer = &code[3]; pointer < codeEnd; ++pointer) { + auto digit = *pointer; + if (!isASCIIHexDigit(digit)) + break; + codePoint = (codePoint << 4) | toASCIIHexValue(digit); + if (codePoint > UCHAR_MAX_VALUE) + return false; + } + return isIdentPart(codePoint) && pointer < codeEnd && *pointer == '}'; } + + // Shortest sequence handled below is \uXXXX, which is 6 characters. + if (codeEnd - code < 6) + return false; + + auto character1 = code[2]; + auto character2 = code[3]; + auto character3 = code[4]; + auto character4 = code[5]; + return isASCIIHexDigit(character1) && isASCIIHexDigit(character2) && isASCIIHexDigit(character3) && isASCIIHexDigit(character4) + && isIdentPart(Lexer::convertUnicode(character1, character2, character3, character4)); +} + +static ALWAYS_INLINE bool isIdentPartIncludingEscape(const LChar* code, const LChar* codeEnd) +{ + return isIdentPartIncludingEscapeTemplate(code, codeEnd); +} + +static ALWAYS_INLINE bool isIdentPartIncludingEscape(const UChar* code, const UChar* codeEnd) +{ + return isIdentPartIncludingEscapeTemplate(code, codeEnd); +} + +static inline LChar singleEscape(int c) +{ + if (c < 128) { + ASSERT(static_cast(c) < ARRAY_SIZE(singleCharacterEscapeValuesForASCII)); + return singleCharacterEscapeValuesForASCII[c]; + } + return 0; } template @@ -636,7 +863,40 @@ inline void Lexer::record16(int c) ASSERT(c <= static_cast(USHRT_MAX)); m_buffer16.append(static_cast(c)); } + +template inline void Lexer::recordUnicodeCodePoint(UChar32 codePoint) +{ + ASSERT(codePoint >= 0); + ASSERT(codePoint <= UCHAR_MAX_VALUE); + if (U_IS_BMP(codePoint)) + record16(codePoint); + else { + UChar codeUnits[2] = { U16_LEAD(codePoint), U16_TRAIL(codePoint) }; + append16(codeUnits, 2); + } +} +#if !ASSERT_DISABLED +bool isSafeBuiltinIdentifier(VM& vm, const Identifier* ident) +{ + if (!ident) + return true; + /* Just block any use of suspicious identifiers. This is intended to + * be used as a safety net while implementing builtins. + */ + // FIXME: How can a debug-only assertion be a safety net? + if (*ident == vm.propertyNames->builtinNames().callPublicName()) + return false; + if (*ident == vm.propertyNames->builtinNames().applyPublicName()) + return false; + if (*ident == vm.propertyNames->eval) + return false; + if (*ident == vm.propertyNames->Function) + return false; + return true; +} +#endif + template <> template ALWAYS_INLINE JSTokenType Lexer::parseIdentifier(JSTokenData* tokenData, unsigned lexerFlags, bool strictMode) { @@ -648,31 +908,47 @@ template ALWAYS_INLINE JSTokenType Lexer::p return keyword == RESERVED_IF_STRICT && !strictMode ? IDENT : keyword; } } - - const LChar* identifierStart = currentCharacter(); + + bool isPrivateName = m_current == '@' && m_parsingBuiltinFunction; + if (isPrivateName) + shift(); + + const LChar* identifierStart = currentSourcePtr(); + unsigned identifierLineStart = currentLineStartOffset(); while (isIdentPart(m_current)) shift(); if (UNLIKELY(m_current == '\\')) { - setOffsetFromCharOffset(identifierStart); + setOffsetFromSourcePtr(identifierStart, identifierLineStart); return parseIdentifierSlowCase(tokenData, lexerFlags, strictMode); } const Identifier* ident = 0; - if (shouldCreateIdentifier) { - int identifierLength = currentCharacter() - identifierStart; + if (shouldCreateIdentifier || m_parsingBuiltinFunction) { + int identifierLength = currentSourcePtr() - identifierStart; ident = makeIdentifier(identifierStart, identifierLength); - + if (m_parsingBuiltinFunction) { + if (!isSafeBuiltinIdentifier(*m_vm, ident) && !isPrivateName) { + m_lexErrorMessage = makeString("The use of '", ident->string(), "' is disallowed in builtin functions."); + return ERRORTOK; + } + if (isPrivateName) + ident = m_vm->propertyNames->getPrivateName(*ident); + else if (*ident == m_vm->propertyNames->undefinedKeyword) + tokenData->ident = &m_vm->propertyNames->undefinedPrivateName; + if (!ident) + return INVALID_PRIVATE_NAME_ERRORTOK; + } tokenData->ident = ident; } else tokenData->ident = 0; - if (UNLIKELY((remaining < maxTokenLength) && !(lexerFlags & LexerFlagsIgnoreReservedWords))) { + if (UNLIKELY((remaining < maxTokenLength) && !(lexerFlags & LexerFlagsIgnoreReservedWords)) && !isPrivateName) { ASSERT(shouldCreateIdentifier); if (remaining < maxTokenLength) { - const HashEntry* entry = m_globalData->keywords->getKeyword(*ident); + const HashTableValue* entry = m_vm->keywords->getKeyword(*ident); ASSERT((remaining < maxTokenLength) || !entry); if (!entry) return IDENT; @@ -696,8 +972,13 @@ template ALWAYS_INLINE JSTokenType Lexer::p return keyword == RESERVED_IF_STRICT && !strictMode ? IDENT : keyword; } } + + bool isPrivateName = m_current == '@' && m_parsingBuiltinFunction; + if (isPrivateName) + shift(); - const UChar* identifierStart = currentCharacter(); + const UChar* identifierStart = currentSourcePtr(); + int identifierLineStart = currentLineStartOffset(); UChar orAllChars = 0; @@ -707,7 +988,8 @@ template ALWAYS_INLINE JSTokenType Lexer::p } if (UNLIKELY(m_current == '\\')) { - setOffsetFromCharOffset(identifierStart); + ASSERT(!isPrivateName); + setOffsetFromSourcePtr(identifierStart, identifierLineStart); return parseIdentifierSlowCase(tokenData, lexerFlags, strictMode); } @@ -718,21 +1000,32 @@ template ALWAYS_INLINE JSTokenType Lexer::p const Identifier* ident = 0; - if (shouldCreateIdentifier) { - int identifierLength = currentCharacter() - identifierStart; + if (shouldCreateIdentifier || m_parsingBuiltinFunction) { + int identifierLength = currentSourcePtr() - identifierStart; if (isAll8Bit) ident = makeIdentifierLCharFromUChar(identifierStart, identifierLength); else ident = makeIdentifier(identifierStart, identifierLength); - + if (m_parsingBuiltinFunction) { + if (!isSafeBuiltinIdentifier(*m_vm, ident) && !isPrivateName) { + m_lexErrorMessage = makeString("The use of '", ident->string(), "' is disallowed in builtin functions."); + return ERRORTOK; + } + if (isPrivateName) + ident = m_vm->propertyNames->getPrivateName(*ident); + else if (*ident == m_vm->propertyNames->undefinedKeyword) + tokenData->ident = &m_vm->propertyNames->undefinedPrivateName; + if (!ident) + return INVALID_PRIVATE_NAME_ERRORTOK; + } tokenData->ident = ident; } else tokenData->ident = 0; - if (UNLIKELY((remaining < maxTokenLength) && !(lexerFlags & LexerFlagsIgnoreReservedWords))) { + if (UNLIKELY((remaining < maxTokenLength) && !(lexerFlags & LexerFlagsIgnoreReservedWords)) && !isPrivateName) { ASSERT(shouldCreateIdentifier); if (remaining < maxTokenLength) { - const HashEntry* entry = m_globalData->keywords->getKeyword(*ident); + const HashTableValue* entry = m_vm->keywords->getKeyword(*ident); ASSERT((remaining < maxTokenLength) || !entry); if (!entry) return IDENT; @@ -745,11 +1038,9 @@ template ALWAYS_INLINE JSTokenType Lexer::p return IDENT; } -template -template JSTokenType Lexer::parseIdentifierSlowCase(JSTokenData* tokenData, unsigned lexerFlags, bool strictMode) +template template JSTokenType Lexer::parseIdentifierSlowCase(JSTokenData* tokenData, unsigned lexerFlags, bool strictMode) { - const ptrdiff_t remaining = m_codeEnd - m_code; - const T* identifierStart = currentCharacter(); + auto identifierStart = currentSourcePtr(); bool bufferRequired = false; while (true) { @@ -762,54 +1053,49 @@ template JSTokenType Lexer::parseIdentifierSlow // \uXXXX unicode characters. bufferRequired = true; - if (identifierStart != currentCharacter()) - m_buffer16.append(identifierStart, currentCharacter() - identifierStart); + if (identifierStart != currentSourcePtr()) + m_buffer16.append(identifierStart, currentSourcePtr() - identifierStart); shift(); if (UNLIKELY(m_current != 'u')) - return ERRORTOK; + return atEnd() ? UNTERMINATED_IDENTIFIER_ESCAPE_ERRORTOK : INVALID_IDENTIFIER_ESCAPE_ERRORTOK; shift(); - int character = parseFourDigitUnicodeHex(); - if (UNLIKELY(character == -1)) - return ERRORTOK; - UChar ucharacter = static_cast(character); - if (UNLIKELY(m_buffer16.size() ? !isIdentPart(ucharacter) : !isIdentStart(ucharacter))) - return ERRORTOK; + auto character = parseUnicodeEscape(); + if (UNLIKELY(!character.isValid())) + return character.isIncomplete() ? UNTERMINATED_IDENTIFIER_UNICODE_ESCAPE_ERRORTOK : INVALID_IDENTIFIER_UNICODE_ESCAPE_ERRORTOK; + if (UNLIKELY(m_buffer16.size() ? !isIdentPart(character.value()) : !isIdentStart(character.value()))) + return INVALID_IDENTIFIER_UNICODE_ESCAPE_ERRORTOK; if (shouldCreateIdentifier) - record16(ucharacter); - identifierStart = currentCharacter(); + recordUnicodeCodePoint(character.value()); + identifierStart = currentSourcePtr(); } int identifierLength; - const Identifier* ident = 0; + const Identifier* ident = nullptr; if (shouldCreateIdentifier) { if (!bufferRequired) { - identifierLength = currentCharacter() - identifierStart; + identifierLength = currentSourcePtr() - identifierStart; ident = makeIdentifier(identifierStart, identifierLength); } else { - if (identifierStart != currentCharacter()) - m_buffer16.append(identifierStart, currentCharacter() - identifierStart); + if (identifierStart != currentSourcePtr()) + m_buffer16.append(identifierStart, currentSourcePtr() - identifierStart); ident = makeIdentifier(m_buffer16.data(), m_buffer16.size()); } tokenData->ident = ident; } else - tokenData->ident = 0; + tokenData->ident = nullptr; - if (LIKELY(!bufferRequired && !(lexerFlags & LexerFlagsIgnoreReservedWords))) { + m_buffer16.shrink(0); + + if (LIKELY(!(lexerFlags & LexerFlagsIgnoreReservedWords))) { ASSERT(shouldCreateIdentifier); - // Keywords must not be recognized if there was an \uXXXX in the identifier. - if (remaining < maxTokenLength) { - const HashEntry* entry = m_globalData->keywords->getKeyword(*ident); - ASSERT((remaining < maxTokenLength) || !entry); - if (!entry) - return IDENT; - JSTokenType token = static_cast(entry->lexerValue()); - return (token != RESERVED_IF_STRICT) || strictMode ? token : IDENT; - } - return IDENT; + const HashTableValue* entry = m_vm->keywords->getKeyword(*ident); + if (!entry) + return IDENT; + JSTokenType token = static_cast(entry->lexerValue()); + return (token != RESERVED_IF_STRICT) || strictMode ? token : IDENT; } - m_buffer16.resize(0); return IDENT; } @@ -824,24 +1110,25 @@ static ALWAYS_INLINE bool characterRequiresParseStringSlowCase(UChar character) } template -template ALWAYS_INLINE bool Lexer::parseString(JSTokenData* tokenData, bool strictMode) +template ALWAYS_INLINE typename Lexer::StringParseResult Lexer::parseString(JSTokenData* tokenData, bool strictMode) { int startingOffset = currentOffset(); + int startingLineStartOffset = currentLineStartOffset(); int startingLineNumber = lineNumber(); T stringQuoteCharacter = m_current; shift(); - const T* stringStart = currentCharacter(); + const T* stringStart = currentSourcePtr(); while (m_current != stringQuoteCharacter) { if (UNLIKELY(m_current == '\\')) { - if (stringStart != currentCharacter() && shouldBuildStrings) - append8(stringStart, currentCharacter() - stringStart); + if (stringStart != currentSourcePtr() && shouldBuildStrings) + append8(stringStart, currentSourcePtr() - stringStart); shift(); - int escape = singleEscape(m_current); + LChar escape = singleEscape(m_current); - // Most common escape sequences first + // Most common escape sequences first. if (escape) { if (shouldBuildStrings) record8(escape); @@ -851,8 +1138,8 @@ template ALWAYS_INLINE bool Lexer::parseString(JSTo else if (m_current == 'x') { shift(); if (!isASCIIHexDigit(m_current) || !isASCIIHexDigit(peek(1))) { - m_lexErrorMessage = "\\x can only be followed by a hex character sequence"; - return false; + m_lexErrorMessage = ASCIILiteral("\\x can only be followed by a hex character sequence"); + return (atEnd() || (isASCIIHexDigit(m_current) && (m_code + 1 == m_codeEnd))) ? StringUnterminated : StringCannotBeParsed; } T prev = m_current; shift(); @@ -860,51 +1147,137 @@ template ALWAYS_INLINE bool Lexer::parseString(JSTo record8(convertHex(prev, m_current)); shift(); } else { - setOffset(startingOffset); + setOffset(startingOffset, startingLineStartOffset); setLineNumber(startingLineNumber); - m_buffer8.resize(0); + m_buffer8.shrink(0); return parseStringSlowCase(tokenData, strictMode); } - stringStart = currentCharacter(); + stringStart = currentSourcePtr(); continue; } if (UNLIKELY(characterRequiresParseStringSlowCase(m_current))) { - setOffset(startingOffset); + setOffset(startingOffset, startingLineStartOffset); setLineNumber(startingLineNumber); - m_buffer8.resize(0); + m_buffer8.shrink(0); return parseStringSlowCase(tokenData, strictMode); } shift(); } - if (currentCharacter() != stringStart && shouldBuildStrings) - append8(stringStart, currentCharacter() - stringStart); + if (currentSourcePtr() != stringStart && shouldBuildStrings) + append8(stringStart, currentSourcePtr() - stringStart); if (shouldBuildStrings) { tokenData->ident = makeIdentifier(m_buffer8.data(), m_buffer8.size()); - m_buffer8.resize(0); + m_buffer8.shrink(0); } else tokenData->ident = 0; - return true; + return StringParsedSuccessfully; } template -template bool Lexer::parseStringSlowCase(JSTokenData* tokenData, bool strictMode) +template ALWAYS_INLINE auto Lexer::parseComplexEscape(EscapeParseMode escapeParseMode, bool strictMode, T stringQuoteCharacter) -> StringParseResult +{ + if (m_current == 'x') { + shift(); + if (!isASCIIHexDigit(m_current) || !isASCIIHexDigit(peek(1))) { + m_lexErrorMessage = ASCIILiteral("\\x can only be followed by a hex character sequence"); + return StringCannotBeParsed; + } + T prev = m_current; + shift(); + if (shouldBuildStrings) + record16(convertHex(prev, m_current)); + shift(); + return StringParsedSuccessfully; + } + + if (m_current == 'u') { + shift(); + + if (escapeParseMode == EscapeParseMode::String && m_current == stringQuoteCharacter) { + if (shouldBuildStrings) + record16('u'); + return StringParsedSuccessfully; + } + + auto character = parseUnicodeEscape(); + if (character.isValid()) { + if (shouldBuildStrings) + recordUnicodeCodePoint(character.value()); + return StringParsedSuccessfully; + } + + m_lexErrorMessage = ASCIILiteral("\\u can only be followed by a Unicode character sequence"); + return character.isIncomplete() ? StringUnterminated : StringCannotBeParsed; + } + + if (strictMode) { + if (isASCIIDigit(m_current)) { + // The only valid numeric escape in strict mode is '\0', and this must not be followed by a decimal digit. + int character1 = m_current; + shift(); + if (character1 != '0' || isASCIIDigit(m_current)) { + m_lexErrorMessage = ASCIILiteral("The only valid numeric escape in strict mode is '\\0'"); + return StringCannotBeParsed; + } + if (shouldBuildStrings) + record16(0); + return StringParsedSuccessfully; + } + } else { + if (isASCIIOctalDigit(m_current)) { + // Octal character sequences + T character1 = m_current; + shift(); + if (isASCIIOctalDigit(m_current)) { + // Two octal characters + T character2 = m_current; + shift(); + if (character1 >= '0' && character1 <= '3' && isASCIIOctalDigit(m_current)) { + if (shouldBuildStrings) + record16((character1 - '0') * 64 + (character2 - '0') * 8 + m_current - '0'); + shift(); + } else { + if (shouldBuildStrings) + record16((character1 - '0') * 8 + character2 - '0'); + } + } else { + if (shouldBuildStrings) + record16(character1 - '0'); + } + return StringParsedSuccessfully; + } + } + + if (!atEnd()) { + if (shouldBuildStrings) + record16(m_current); + shift(); + return StringParsedSuccessfully; + } + + m_lexErrorMessage = ASCIILiteral("Unterminated string constant"); + return StringUnterminated; +} + +template +template auto Lexer::parseStringSlowCase(JSTokenData* tokenData, bool strictMode) -> StringParseResult { T stringQuoteCharacter = m_current; shift(); - const T* stringStart = currentCharacter(); + const T* stringStart = currentSourcePtr(); while (m_current != stringQuoteCharacter) { if (UNLIKELY(m_current == '\\')) { - if (stringStart != currentCharacter() && shouldBuildStrings) - append16(stringStart, currentCharacter() - stringStart); + if (stringStart != currentSourcePtr() && shouldBuildStrings) + append16(stringStart, currentSourcePtr() - stringStart); shift(); - int escape = singleEscape(m_current); + LChar escape = singleEscape(m_current); // Most common escape sequences first if (escape) { @@ -913,96 +1286,219 @@ template bool Lexer::parseStringSlowCase(JSTokenDat shift(); } else if (UNLIKELY(isLineTerminator(m_current))) shiftLineTerminator(); - else if (m_current == 'x') { - shift(); - if (!isASCIIHexDigit(m_current) || !isASCIIHexDigit(peek(1))) { - m_lexErrorMessage = "\\x can only be followed by a hex character sequence"; - return false; - } - T prev = m_current; - shift(); - if (shouldBuildStrings) - record16(convertHex(prev, m_current)); - shift(); - } else if (m_current == 'u') { - shift(); - int character = parseFourDigitUnicodeHex(); - if (character != -1) { - if (shouldBuildStrings) - record16(character); - } else if (m_current == stringQuoteCharacter) { - if (shouldBuildStrings) - record16('u'); - } else { - m_lexErrorMessage = "\\u can only be followed by a Unicode character sequence"; - return false; - } - } else if (strictMode && isASCIIDigit(m_current)) { - // The only valid numeric escape in strict mode is '\0', and this must not be followed by a decimal digit. - int character1 = m_current; - shift(); - if (character1 != '0' || isASCIIDigit(m_current)) { - m_lexErrorMessage = "The only valid numeric escape in strict mode is '\\0'"; - return false; - } + else { + StringParseResult result = parseComplexEscape(EscapeParseMode::String, strictMode, stringQuoteCharacter); + if (result != StringParsedSuccessfully) + return result; + } + + stringStart = currentSourcePtr(); + continue; + } + // Fast check for characters that require special handling. + // Catches 0, \n, \r, 0x2028, and 0x2029 as efficiently + // as possible, and lets through all common ASCII characters. + if (UNLIKELY(((static_cast(m_current) - 0xE) & 0x2000))) { + // New-line or end of input is not allowed + if (atEnd() || isLineTerminator(m_current)) { + m_lexErrorMessage = ASCIILiteral("Unexpected EOF"); + return atEnd() ? StringUnterminated : StringCannotBeParsed; + } + // Anything else is just a normal character + } + shift(); + } + + if (currentSourcePtr() != stringStart && shouldBuildStrings) + append16(stringStart, currentSourcePtr() - stringStart); + if (shouldBuildStrings) + tokenData->ident = makeIdentifier(m_buffer16.data(), m_buffer16.size()); + else + tokenData->ident = 0; + + m_buffer16.shrink(0); + return StringParsedSuccessfully; +} + +#if ENABLE(ES6_TEMPLATE_LITERAL_SYNTAX) +// While the lexer accepts (not ) sequence +// as one line terminator and increments one line number, +// TemplateLiteral considers it as two line terminators and . +// +// TemplateLiteral normalizes line terminators as follows. +// +// => +// => +// => +// <\u2028> => <\u2028> +// <\u2029> => <\u2029> +// +// So, should be normalized to . +// However, the lexer should increment the line number only once for . +// +// To achieve this, LineNumberAdder holds the current status of line terminator sequence. +// When TemplateLiteral lexer encounters a line terminator, it notifies to LineNumberAdder. +// LineNumberAdder maintains the status and increments the line number when it's necessary. +// For example, LineNumberAdder increments the line number only once for and . +template +class LineNumberAdder { +public: + LineNumberAdder(int& lineNumber) + : m_lineNumber(lineNumber) + { + } + + void clear() + { + m_previous = 0; + } + + void add(CharacterType character) + { + ASSERT(Lexer::isLineTerminator(character)); + if ((character + m_previous) == ('\n' + '\r')) + m_previous = 0; + else { + ++m_lineNumber; + m_previous = character; + } + } + +private: + int& m_lineNumber; + CharacterType m_previous { 0 }; +}; + +template +template typename Lexer::StringParseResult Lexer::parseTemplateLiteral(JSTokenData* tokenData, RawStringsBuildMode rawStringsBuildMode) +{ + const T* stringStart = currentSourcePtr(); + const T* rawStringStart = currentSourcePtr(); + + LineNumberAdder lineNumberAdder(m_lineNumber); + + while (m_current != '`') { + if (UNLIKELY(m_current == '\\')) { + lineNumberAdder.clear(); + if (stringStart != currentSourcePtr() && shouldBuildStrings) + append16(stringStart, currentSourcePtr() - stringStart); + shift(); + + LChar escape = singleEscape(m_current); + + // Most common escape sequences first. + if (escape) { if (shouldBuildStrings) - record16(0); - } else if (!strictMode && isASCIIOctalDigit(m_current)) { - // Octal character sequences - T character1 = m_current; + record16(escape); shift(); - if (isASCIIOctalDigit(m_current)) { - // Two octal characters - T character2 = m_current; + } else if (UNLIKELY(isLineTerminator(m_current))) { + if (m_current == '\r') { + lineNumberAdder.add(m_current); shift(); - if (character1 >= '0' && character1 <= '3' && isASCIIOctalDigit(m_current)) { - if (shouldBuildStrings) - record16((character1 - '0') * 64 + (character2 - '0') * 8 + m_current - '0'); + if (m_current == '\n') { + lineNumberAdder.add(m_current); shift(); - } else { - if (shouldBuildStrings) - record16((character1 - '0') * 8 + character2 - '0'); } } else { - if (shouldBuildStrings) - record16(character1 - '0'); + lineNumberAdder.add(m_current); + shift(); } - } else if (!atEnd()) { - if (shouldBuildStrings) - record16(m_current); - shift(); } else { - m_lexErrorMessage = "Unterminated string constant"; - return false; + bool strictMode = true; + StringParseResult result = parseComplexEscape(EscapeParseMode::Template, strictMode, '`'); + if (result != StringParsedSuccessfully) + return result; } - stringStart = currentCharacter(); + stringStart = currentSourcePtr(); continue; } + + if (m_current == '$' && peek(1) == '{') + break; + // Fast check for characters that require special handling. // Catches 0, \n, \r, 0x2028, and 0x2029 as efficiently // as possible, and lets through all common ASCII characters. if (UNLIKELY(((static_cast(m_current) - 0xE) & 0x2000))) { - // New-line or end of input is not allowed - if (atEnd() || isLineTerminator(m_current)) { - m_lexErrorMessage = "Unexpected EOF"; - return false; + // End of input is not allowed. + // Unlike String, line terminator is allowed. + if (atEnd()) { + m_lexErrorMessage = ASCIILiteral("Unexpected EOF"); + return atEnd() ? StringUnterminated : StringCannotBeParsed; + } + + if (isLineTerminator(m_current)) { + if (m_current == '\r') { + // Normalize , to . + if (shouldBuildStrings) { + if (stringStart != currentSourcePtr()) + append16(stringStart, currentSourcePtr() - stringStart); + if (rawStringStart != currentSourcePtr() && rawStringsBuildMode == RawStringsBuildMode::BuildRawStrings) + m_bufferForRawTemplateString16.append(rawStringStart, currentSourcePtr() - rawStringStart); + + record16('\n'); + if (rawStringsBuildMode == RawStringsBuildMode::BuildRawStrings) + m_bufferForRawTemplateString16.append('\n'); + } + lineNumberAdder.add(m_current); + shift(); + if (m_current == '\n') { + lineNumberAdder.add(m_current); + shift(); + } + stringStart = currentSourcePtr(); + rawStringStart = currentSourcePtr(); + } else { + lineNumberAdder.add(m_current); + shift(); + } + continue; } // Anything else is just a normal character } + + lineNumberAdder.clear(); shift(); } - if (currentCharacter() != stringStart && shouldBuildStrings) - append16(stringStart, currentCharacter() - stringStart); - if (shouldBuildStrings) - tokenData->ident = makeIdentifier(m_buffer16.data(), m_buffer16.size()); - else - tokenData->ident = 0; + bool isTail = m_current == '`'; - m_buffer16.resize(0); - return true; + if (shouldBuildStrings) { + if (currentSourcePtr() != stringStart) + append16(stringStart, currentSourcePtr() - stringStart); + if (rawStringStart != currentSourcePtr() && rawStringsBuildMode == RawStringsBuildMode::BuildRawStrings) + m_bufferForRawTemplateString16.append(rawStringStart, currentSourcePtr() - rawStringStart); + } + + if (shouldBuildStrings) { + tokenData->cooked = makeIdentifier(m_buffer16.data(), m_buffer16.size()); + // Line terminator normalization (e.g. => ) should be applied to both the raw and cooked representations. + if (rawStringsBuildMode == RawStringsBuildMode::BuildRawStrings) + tokenData->raw = makeIdentifier(m_bufferForRawTemplateString16.data(), m_bufferForRawTemplateString16.size()); + else + tokenData->raw = makeEmptyIdentifier(); + } else { + tokenData->cooked = makeEmptyIdentifier(); + tokenData->raw = makeEmptyIdentifier(); + } + tokenData->isTail = isTail; + + m_buffer16.shrink(0); + m_bufferForRawTemplateString16.shrink(0); + + if (isTail) { + // Skip ` + shift(); + } else { + // Skip $ and { + shift(); + shift(); + } + + return StringParsedSuccessfully; } +#endif template ALWAYS_INLINE void Lexer::parseHex(double& returnValue) @@ -1011,9 +1507,6 @@ ALWAYS_INLINE void Lexer::parseHex(double& returnValue) uint32_t hexValue = 0; int maximumDigits = 7; - // Shift out the 'x' prefix. - shift(); - do { hexValue = (hexValue << 4) + toASCIIHexValue(m_current); shift(); @@ -1044,29 +1537,68 @@ ALWAYS_INLINE void Lexer::parseHex(double& returnValue) returnValue = parseIntOverflow(m_buffer8.data(), m_buffer8.size(), 16); } +template +ALWAYS_INLINE bool Lexer::parseBinary(double& returnValue) +{ + // Optimization: most binary values fit into 4 bytes. + uint32_t binaryValue = 0; + const unsigned maximumDigits = 32; + int digit = maximumDigits - 1; + // Temporary buffer for the digits. Makes easier + // to reconstruct the input characters when needed. + LChar digits[maximumDigits]; + + do { + binaryValue = (binaryValue << 1) + (m_current - '0'); + digits[digit] = m_current; + shift(); + --digit; + } while (isASCIIBinaryDigit(m_current) && digit >= 0); + + if (!isASCIIDigit(m_current) && digit >= 0) { + returnValue = binaryValue; + return true; + } + + for (int i = maximumDigits - 1; i > digit; --i) + record8(digits[i]); + + while (isASCIIBinaryDigit(m_current)) { + record8(m_current); + shift(); + } + + if (isASCIIDigit(m_current)) + return false; + + returnValue = parseIntOverflow(m_buffer8.data(), m_buffer8.size(), 2); + return true; +} + template ALWAYS_INLINE bool Lexer::parseOctal(double& returnValue) { // Optimization: most octal values fit into 4 bytes. uint32_t octalValue = 0; - int maximumDigits = 9; + const unsigned maximumDigits = 10; + int digit = maximumDigits - 1; // Temporary buffer for the digits. Makes easier // to reconstruct the input characters when needed. - LChar digits[10]; + LChar digits[maximumDigits]; do { octalValue = octalValue * 8 + (m_current - '0'); - digits[maximumDigits] = m_current; + digits[digit] = m_current; shift(); - --maximumDigits; - } while (isASCIIOctalDigit(m_current) && maximumDigits >= 0); + --digit; + } while (isASCIIOctalDigit(m_current) && digit >= 0); - if (!isASCIIDigit(m_current) && maximumDigits >= 0) { + if (!isASCIIDigit(m_current) && digit >= 0) { returnValue = octalValue; return true; } - for (int i = 9; i > maximumDigits; --i) + for (int i = maximumDigits - 1; i > digit; --i) record8(digits[i]); while (isASCIIOctalDigit(m_current)) { @@ -1090,24 +1622,25 @@ ALWAYS_INLINE bool Lexer::parseDecimal(double& returnValue) // Since parseOctal may be executed before parseDecimal, // the m_buffer8 may hold ascii digits. if (!m_buffer8.size()) { - int maximumDigits = 9; + const unsigned maximumDigits = 10; + int digit = maximumDigits - 1; // Temporary buffer for the digits. Makes easier // to reconstruct the input characters when needed. - LChar digits[10]; + LChar digits[maximumDigits]; do { decimalValue = decimalValue * 10 + (m_current - '0'); - digits[maximumDigits] = m_current; + digits[digit] = m_current; shift(); - --maximumDigits; - } while (isASCIIDigit(m_current) && maximumDigits >= 0); + --digit; + } while (isASCIIDigit(m_current) && digit >= 0); - if (maximumDigits >= 0 && m_current != '.' && (m_current | 0x20) != 'e') { + if (digit >= 0 && m_current != '.' && (m_current | 0x20) != 'e') { returnValue = decimalValue; return true; } - for (int i = 9; i > maximumDigits; --i) + for (int i = maximumDigits - 1; i > digit; --i) record8(digits[i]); } @@ -1182,9 +1715,25 @@ bool Lexer::nextTokenIsColon() return code < m_codeEnd && *code == ':'; } +#if ENABLE(ES6_ARROWFUNCTION_SYNTAX) +template +void Lexer::setTokenPosition(JSToken* tokenRecord) +{ + JSTokenData* tokenData = &tokenRecord->m_data; + tokenData->line = lineNumber(); + tokenData->offset = currentOffset(); + tokenData->lineStartOffset = currentLineStartOffset(); + ASSERT(tokenData->offset >= tokenData->lineStartOffset); +} +#endif + template -JSTokenType Lexer::lex(JSTokenData* tokenData, JSTokenInfo* tokenInfo, unsigned lexerFlags, bool strictMode) +JSTokenType Lexer::lex(JSToken* tokenRecord, unsigned lexerFlags, bool strictMode) { + JSTokenData* tokenData = &tokenRecord->m_data; + JSTokenLocation* tokenLocation = &tokenRecord->m_location; + m_lastTockenLocation = JSTokenLocation(tokenRecord->m_location); + ASSERT(!m_error); ASSERT(m_buffer8.isEmpty()); ASSERT(m_buffer16.isEmpty()); @@ -1199,7 +1748,9 @@ start: if (atEnd()) return EOFTOK; - tokenInfo->startOffset = currentOffset(); + tokenLocation->startOffset = currentOffset(); + ASSERT(currentOffset() >= currentLineStartOffset()); + tokenRecord->m_startPosition = currentPosition(); CharacterType type; if (LIKELY(isLatin1(m_current))) @@ -1241,7 +1792,19 @@ start: } token = GT; break; - case CharacterEqual: + case CharacterEqual: { +#if ENABLE(ES6_ARROWFUNCTION_SYNTAX) + if (peek(1) == '>') { + token = ARROWFUNCTION; + tokenData->line = lineNumber(); + tokenData->offset = currentOffset(); + tokenData->lineStartOffset = currentLineStartOffset(); + ASSERT(tokenData->offset >= tokenData->lineStartOffset); + shift(); + shift(); + break; + } +#endif shift(); if (m_current == '=') { shift(); @@ -1255,6 +1818,7 @@ start: } token = EQUAL; break; + } case CharacterLess: shift(); if (m_current == '!' && peek(1) == '-' && peek(2) == '-') { @@ -1343,7 +1907,8 @@ start: shift(); if (parseMultilineComment()) goto start; - m_lexErrorMessage = "Multiline comment was not closed properly"; + m_lexErrorMessage = ASCIILiteral("Multiline comment was not closed properly"); + token = UNTERMINATED_MULTILINE_COMMENT_ERRORTOK; goto returnError; } if (m_current == '=') { @@ -1436,82 +2001,181 @@ start: token = SEMICOLON; break; case CharacterOpenBrace: - tokenData->intValue = currentOffset(); + tokenData->line = lineNumber(); + tokenData->offset = currentOffset(); + tokenData->lineStartOffset = currentLineStartOffset(); + ASSERT(tokenData->offset >= tokenData->lineStartOffset); shift(); token = OPENBRACE; break; case CharacterCloseBrace: - tokenData->intValue = currentOffset(); + tokenData->line = lineNumber(); + tokenData->offset = currentOffset(); + tokenData->lineStartOffset = currentLineStartOffset(); + ASSERT(tokenData->offset >= tokenData->lineStartOffset); shift(); token = CLOSEBRACE; break; case CharacterDot: shift(); if (!isASCIIDigit(m_current)) { + if (UNLIKELY((m_current == '.') && (peek(1) == '.'))) { + shift(); + shift(); + token = DOTDOTDOT; + break; + } token = DOT; break; } goto inNumberAfterDecimalPoint; case CharacterZero: shift(); - if ((m_current | 0x20) == 'x' && isASCIIHexDigit(peek(1))) { + if ((m_current | 0x20) == 'x') { + if (!isASCIIHexDigit(peek(1))) { + m_lexErrorMessage = ASCIILiteral("No hexadecimal digits after '0x'"); + token = INVALID_HEX_NUMBER_ERRORTOK; + goto returnError; + } + + // Shift out the 'x' prefix. + shift(); + parseHex(tokenData->doubleValue); - token = NUMBER; - } else { - record8('0'); - if (isASCIIOctalDigit(m_current)) { - if (parseOctal(tokenData->doubleValue)) { - if (strictMode) { - m_lexErrorMessage = "Octal escapes are forbidden in strict mode"; - goto returnError; - } - token = NUMBER; - } + if (isIdentStart(m_current)) { + m_lexErrorMessage = ASCIILiteral("No space between hexadecimal literal and identifier"); + token = INVALID_HEX_NUMBER_ERRORTOK; + goto returnError; } + token = tokenTypeForIntegerLikeToken(tokenData->doubleValue); + m_buffer8.shrink(0); + break; } - // Fall through into CharacterNumber + if ((m_current | 0x20) == 'b') { + if (!isASCIIBinaryDigit(peek(1))) { + m_lexErrorMessage = ASCIILiteral("No binary digits after '0b'"); + token = INVALID_BINARY_NUMBER_ERRORTOK; + goto returnError; + } + + // Shift out the 'b' prefix. + shift(); + + parseBinary(tokenData->doubleValue); + if (isIdentStart(m_current)) { + m_lexErrorMessage = ASCIILiteral("No space between binary literal and identifier"); + token = INVALID_BINARY_NUMBER_ERRORTOK; + goto returnError; + } + token = tokenTypeForIntegerLikeToken(tokenData->doubleValue); + m_buffer8.shrink(0); + break; + } + + if ((m_current | 0x20) == 'o') { + if (!isASCIIOctalDigit(peek(1))) { + m_lexErrorMessage = ASCIILiteral("No octal digits after '0o'"); + token = INVALID_OCTAL_NUMBER_ERRORTOK; + goto returnError; + } + + // Shift out the 'o' prefix. + shift(); + + parseOctal(tokenData->doubleValue); + if (isIdentStart(m_current)) { + m_lexErrorMessage = ASCIILiteral("No space between octal literal and identifier"); + token = INVALID_OCTAL_NUMBER_ERRORTOK; + goto returnError; + } + token = tokenTypeForIntegerLikeToken(tokenData->doubleValue); + m_buffer8.shrink(0); + break; + } + + record8('0'); + if (strictMode && isASCIIDigit(m_current)) { + m_lexErrorMessage = ASCIILiteral("Decimal integer literals with a leading zero are forbidden in strict mode"); + token = INVALID_OCTAL_NUMBER_ERRORTOK; + goto returnError; + } + if (isASCIIOctalDigit(m_current)) { + if (parseOctal(tokenData->doubleValue)) { + token = tokenTypeForIntegerLikeToken(tokenData->doubleValue); + } + } + FALLTHROUGH; case CharacterNumber: - if (LIKELY(token != NUMBER)) { + if (LIKELY(token != INTEGER && token != DOUBLE)) { if (!parseDecimal(tokenData->doubleValue)) { + token = INTEGER; if (m_current == '.') { shift(); inNumberAfterDecimalPoint: parseNumberAfterDecimalPoint(); + token = DOUBLE; } if ((m_current | 0x20) == 'e') { if (!parseNumberAfterExponentIndicator()) { - m_lexErrorMessage = "Non-number found after exponent indicator"; + m_lexErrorMessage = ASCIILiteral("Non-number found after exponent indicator"); + token = atEnd() ? UNTERMINATED_NUMERIC_LITERAL_ERRORTOK : INVALID_NUMERIC_LITERAL_ERRORTOK; goto returnError; } } size_t parsedLength; tokenData->doubleValue = parseDouble(m_buffer8.data(), m_buffer8.size(), parsedLength); - } - token = NUMBER; + if (token == INTEGER) + token = tokenTypeForIntegerLikeToken(tokenData->doubleValue); + } else + token = tokenTypeForIntegerLikeToken(tokenData->doubleValue); } // No identifiers allowed directly after numeric literal, e.g. "3in" is bad. if (UNLIKELY(isIdentStart(m_current))) { - m_lexErrorMessage = "At least one digit must occur after a decimal point"; + m_lexErrorMessage = ASCIILiteral("At least one digit must occur after a decimal point"); + token = atEnd() ? UNTERMINATED_NUMERIC_LITERAL_ERRORTOK : INVALID_NUMERIC_LITERAL_ERRORTOK; goto returnError; } - m_buffer8.resize(0); + m_buffer8.shrink(0); break; - case CharacterQuote: - if (lexerFlags & LexerFlagsDontBuildStrings) { - if (UNLIKELY(!parseString(tokenData, strictMode))) - goto returnError; - } else { - if (UNLIKELY(!parseString(tokenData, strictMode))) - goto returnError; + case CharacterQuote: { + StringParseResult result = StringCannotBeParsed; + if (lexerFlags & LexerFlagsDontBuildStrings) + result = parseString(tokenData, strictMode); + else + result = parseString(tokenData, strictMode); + + if (UNLIKELY(result != StringParsedSuccessfully)) { + token = result == StringUnterminated ? UNTERMINATED_STRING_LITERAL_ERRORTOK : INVALID_STRING_LITERAL_ERRORTOK; + goto returnError; } shift(); token = STRING; break; + } +#if ENABLE(ES6_TEMPLATE_LITERAL_SYNTAX) + case CharacterBackQuote: { + // Skip backquote. + shift(); + StringParseResult result = StringCannotBeParsed; + if (lexerFlags & LexerFlagsDontBuildStrings) + result = parseTemplateLiteral(tokenData, RawStringsBuildMode::BuildRawStrings); + else + result = parseTemplateLiteral(tokenData, RawStringsBuildMode::BuildRawStrings); + + if (UNLIKELY(result != StringParsedSuccessfully)) { + token = result == StringUnterminated ? UNTERMINATED_TEMPLATE_LITERAL_ERRORTOK : INVALID_TEMPLATE_LITERAL_ERRORTOK; + goto returnError; + } + token = TEMPLATE; + break; + } +#endif case CharacterIdentifierStart: ASSERT(isIdentStart(m_current)); - // Fall through into CharacterBackSlash. + FALLTHROUGH; case CharacterBackSlash: + parseIdent: if (lexerFlags & LexexFlagsDontBuildKeywords) token = parseIdentifier(tokenData, lexerFlags, strictMode); else @@ -1522,13 +2186,21 @@ inNumberAfterDecimalPoint: shiftLineTerminator(); m_atLineStart = true; m_terminator = true; + m_lineStart = m_code; goto start; + case CharacterPrivateIdentifierStart: + if (m_parsingBuiltinFunction) + goto parseIdent; + + FALLTHROUGH; case CharacterInvalid: m_lexErrorMessage = invalidCharacterMessage(); + token = ERRORTOK; goto returnError; default: - ASSERT_NOT_REACHED(); - m_lexErrorMessage = "Internal Error"; + RELEASE_ASSERT_NOT_REACHED(); + m_lexErrorMessage = ASCIILiteral("Internal Error"); + token = ERRORTOK; goto returnError; } @@ -1544,6 +2216,7 @@ inSingleLineComment: shiftLineTerminator(); m_atLineStart = true; m_terminator = true; + m_lineStart = m_code; if (!lastTokenWasRestrKeyword()) goto start; @@ -1551,16 +2224,35 @@ inSingleLineComment: // Fall through into returnToken. returnToken: - tokenInfo->line = m_lineNumber; - tokenInfo->endOffset = currentOffset(); + tokenLocation->line = m_lineNumber; + tokenLocation->endOffset = currentOffset(); + tokenLocation->lineStartOffset = currentLineStartOffset(); + ASSERT(tokenLocation->endOffset >= tokenLocation->lineStartOffset); + tokenRecord->m_endPosition = currentPosition(); m_lastToken = token; return token; returnError: m_error = true; - tokenInfo->line = m_lineNumber; - tokenInfo->endOffset = currentOffset(); - return ERRORTOK; + tokenLocation->line = m_lineNumber; + tokenLocation->endOffset = currentOffset(); + tokenLocation->lineStartOffset = currentLineStartOffset(); + ASSERT(tokenLocation->endOffset >= tokenLocation->lineStartOffset); + tokenRecord->m_endPosition = currentPosition(); + RELEASE_ASSERT(token & ErrorTokenFlag); + return token; +} + +template +static inline void orCharacter(UChar&, UChar); + +template <> +inline void orCharacter(UChar&, UChar) { } + +template <> +inline void orCharacter(UChar& orAccumulator, UChar character) +{ + orAccumulator |= character; } template @@ -1570,6 +2262,7 @@ bool Lexer::scanRegExp(const Identifier*& pattern, const Identifier*& flags, bool lastWasEscape = false; bool inBrackets = false; + UChar charactersOredTogether = 0; if (patternPrefix) { ASSERT(!isLineTerminator(patternPrefix)); @@ -1580,7 +2273,7 @@ bool Lexer::scanRegExp(const Identifier*& pattern, const Identifier*& flags, while (true) { if (isLineTerminator(m_current) || atEnd()) { - m_buffer16.resize(0); + m_buffer16.shrink(0); return false; } @@ -1592,6 +2285,7 @@ bool Lexer::scanRegExp(const Identifier*& pattern, const Identifier*& flags, break; record16(prev); + orCharacter(charactersOredTogether, prev); if (lastWasEscape) { lastWasEscape = false; @@ -1611,16 +2305,19 @@ bool Lexer::scanRegExp(const Identifier*& pattern, const Identifier*& flags, } } - pattern = makeIdentifier(m_buffer16.data(), m_buffer16.size()); - m_buffer16.resize(0); + pattern = makeRightSizedIdentifier(m_buffer16.data(), m_buffer16.size(), charactersOredTogether); + + m_buffer16.shrink(0); + charactersOredTogether = 0; while (isIdentPart(m_current)) { record16(m_current); + orCharacter(charactersOredTogether, m_current); shift(); } - flags = makeIdentifier(m_buffer16.data(), m_buffer16.size()); - m_buffer16.resize(0); + flags = makeRightSizedIdentifier(m_buffer16.data(), m_buffer16.size(), charactersOredTogether); + m_buffer16.shrink(0); return true; } @@ -1666,6 +2363,40 @@ bool Lexer::skipRegExp() return true; } +#if ENABLE(ES6_TEMPLATE_LITERAL_SYNTAX) +template +JSTokenType Lexer::scanTrailingTemplateString(JSToken* tokenRecord, RawStringsBuildMode rawStringsBuildMode) +{ + JSTokenData* tokenData = &tokenRecord->m_data; + JSTokenLocation* tokenLocation = &tokenRecord->m_location; + ASSERT(!m_error); + ASSERT(m_buffer16.isEmpty()); + + // Leading closing brace } is already shifted in the previous token scan. + // So in this re-scan phase, shift() is not needed here. + StringParseResult result = parseTemplateLiteral(tokenData, rawStringsBuildMode); + JSTokenType token = ERRORTOK; + if (UNLIKELY(result != StringParsedSuccessfully)) { + token = result == StringUnterminated ? UNTERMINATED_TEMPLATE_LITERAL_ERRORTOK : INVALID_TEMPLATE_LITERAL_ERRORTOK; + m_error = true; + } else { + token = TEMPLATE; + m_lastToken = token; + } + + // Since TemplateString always ends with ` or }, m_atLineStart always becomes false. + m_atLineStart = false; + + // Adjust current tokenLocation data for TemplateString. + tokenLocation->line = m_lineNumber; + tokenLocation->endOffset = currentOffset(); + tokenLocation->lineStartOffset = currentLineStartOffset(); + ASSERT(tokenLocation->endOffset >= tokenLocation->lineStartOffset); + tokenRecord->m_endPosition = currentPosition(); + return token; +} +#endif + template void Lexer::clear() { @@ -1677,15 +2408,10 @@ void Lexer::clear() Vector newBuffer16; m_buffer16.swap(newBuffer16); - m_isReparsing = false; -} + Vector newBufferForRawTemplateString16; + m_bufferForRawTemplateString16.swap(newBufferForRawTemplateString16); -template -SourceCode Lexer::sourceCode(int openBrace, int closeBrace, int firstLine) -{ - ASSERT((*m_source->provider()->data())[openBrace] == '{'); - ASSERT((*m_source->provider()->data())[closeBrace] == '}'); - return SourceCode(m_source->provider(), openBrace, closeBrace + 1, firstLine); + m_isReparsing = false; } // Instantiate the two flavors of Lexer we need instead of putting most of this file in Lexer.h