parser/Lexer.h

   1 /*
   2  *  Copyright (C) 1999-2000 Harri Porten (porten@kde.org)
   3  *  Copyright (C) 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2011, 2012, 2013 Apple Inc. All rights reserved.
   4  *  Copyright (C) 2010 Zoltan Herczeg (zherczeg@inf.u-szeged.hu)
   5  *
   6  *  This library is free software; you can redistribute it and/or
   7  *  modify it under the terms of the GNU Library General Public
   8  *  License as published by the Free Software Foundation; either
   9  *  version 2 of the License, or (at your option) any later version.
  10  *
  11  *  This library is distributed in the hope that it will be useful,
  12  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
  13  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14  *  Library General Public License for more details.
  15  *
  16  *  You should have received a copy of the GNU Library General Public License
  17  *  along with this library; see the file COPYING.LIB.  If not, write to
  18  *  the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
  19  *  Boston, MA 02110-1301, USA.
  20  *
  21  */
  22
  23 #ifndef Lexer_h
  24 #define Lexer_h
  25
  26 #include "Lookup.h"
  27 #include "ParserArena.h"
  28 #include "ParserTokens.h"
  29 #include "SourceCode.h"
  30 #include <wtf/ASCIICType.h>
  31 #include <wtf/SegmentedVector.h>
  32 #include <wtf/Vector.h>
  33
  34 namespace JSC {
  35
  36 class Keywords {
  37 public:
  38     bool isKeyword(const Identifier& ident) const
  39     {
  40         return m_keywordTable.entry(ident);
  41     }
  42
  43     const HashTableValue* getKeyword(const Identifier& ident) const
  44     {
  45         return m_keywordTable.entry(ident);
  46     }
  47
  48     explicit Keywords(VM&);
  49
  50     ~Keywords()
  51     {
  52         m_keywordTable.deleteTable();
  53     }
  54
  55 private:
  56     friend class VM;
  57
  58     VM& m_vm;
  59     const HashTable m_keywordTable;
  60 };
  61
  62 enum LexerFlags {
  63     LexerFlagsIgnoreReservedWords = 1,
  64     LexerFlagsDontBuildStrings = 2,
  65     LexexFlagsDontBuildKeywords = 4
  66 };
  67
  68 struct ParsedUnicodeEscapeValue;
  69
  70 template <typename T>
  71 class Lexer {
  72     WTF_MAKE_NONCOPYABLE(Lexer);
  73     WTF_MAKE_FAST_ALLOCATED;
  74
  75 public:
  76     Lexer(VM*, JSParserBuiltinMode);
  77     ~Lexer();
  78
  79     // Character manipulation functions.
  80     static bool isWhiteSpace(T character);
  81     static bool isLineTerminator(T character);
  82     static unsigned char convertHex(int c1, int c2);
  83     static UChar convertUnicode(int c1, int c2, int c3, int c4);
  84
  85     // Functions to set up parsing.
  86     void setCode(const SourceCode&, ParserArena*);
  87     void setIsReparsing() { m_isReparsing = true; }
  88     bool isReparsing() const { return m_isReparsing; }
  89
  90 #if ENABLE(ES6_ARROWFUNCTION_SYNTAX)
  91     void setTokenPosition(JSToken* tokenRecord);
  92 #endif
  93     JSTokenType lex(JSToken*, unsigned, bool strictMode);
  94     bool nextTokenIsColon();
  95     int lineNumber() const { return m_lineNumber; }
  96     ALWAYS_INLINE int currentOffset() const { return offsetFromSourcePtr(m_code); }
  97     ALWAYS_INLINE int currentLineStartOffset() const { return offsetFromSourcePtr(m_lineStart); }
  98     ALWAYS_INLINE JSTextPosition currentPosition() const
  99     {
 100         return JSTextPosition(m_lineNumber, currentOffset(), currentLineStartOffset());
 101     }
 102     JSTextPosition positionBeforeLastNewline() const { return m_positionBeforeLastNewline; }
 103     JSTokenLocation lastTokenLocation() const { return m_lastTockenLocation; }
 104     void setLastLineNumber(int lastLineNumber) { m_lastLineNumber = lastLineNumber; }
 105     int lastLineNumber() const { return m_lastLineNumber; }
 106     bool prevTerminator() const { return m_terminator; }
 107     bool scanRegExp(const Identifier*& pattern, const Identifier*& flags, UChar patternPrefix = 0);
 108 #if ENABLE(ES6_TEMPLATE_LITERAL_SYNTAX)
 109     enum class RawStringsBuildMode { BuildRawStrings, DontBuildRawStrings };
 110     JSTokenType scanTrailingTemplateString(JSToken*, RawStringsBuildMode);
 111 #endif
 112     bool skipRegExp();
 113
 114     // Functions for use after parsing.
 115     bool sawError() const { return m_error; }
 116     String getErrorMessage() const { return m_lexErrorMessage; }
 117     void clear();
 118     void setOffset(int offset, int lineStartOffset)
 119     {
 120         m_error = 0;
 121         m_lexErrorMessage = String();
 122
 123         m_code = sourcePtrFromOffset(offset);
 124         m_lineStart = sourcePtrFromOffset(lineStartOffset);
 125         ASSERT(currentOffset() >= currentLineStartOffset());
 126
 127         m_buffer8.resize(0);
 128         m_buffer16.resize(0);
 129         if (LIKELY(m_code < m_codeEnd))
 130             m_current = *m_code;
 131         else
 132             m_current = 0;
 133     }
 134     void setLineNumber(int line)
 135     {
 136         m_lineNumber = line;
 137     }
 138     void setTerminator(bool terminator)
 139     {
 140         m_terminator = terminator;
 141     }
 142
 143     SourceProvider* sourceProvider() const { return m_source->provider(); }
 144
 145     JSTokenType lexExpectIdentifier(JSToken*, unsigned, bool strictMode);
 146
 147 private:
 148     void record8(int);
 149     void append8(const T*, size_t);
 150     void record16(int);
 151     void record16(T);
 152     void recordUnicodeCodePoint(UChar32);
 153     void append16(const LChar*, size_t);
 154     void append16(const UChar* characters, size_t length) { m_buffer16.append(characters, length); }
 155
 156     ALWAYS_INLINE void shift();
 157     ALWAYS_INLINE bool atEnd() const;
 158     ALWAYS_INLINE T peek(int offset) const;
 159
 160     ParsedUnicodeEscapeValue parseUnicodeEscape();
 161     void shiftLineTerminator();
 162
 163     ALWAYS_INLINE int offsetFromSourcePtr(const T* ptr) const { return ptr - m_codeStart; }
 164     ALWAYS_INLINE const T* sourcePtrFromOffset(int offset) const { return m_codeStart + offset; }
 165
 166     String invalidCharacterMessage() const;
 167     ALWAYS_INLINE const T* currentSourcePtr() const;
 168     ALWAYS_INLINE void setOffsetFromSourcePtr(const T* sourcePtr, unsigned lineStartOffset) { setOffset(offsetFromSourcePtr(sourcePtr), lineStartOffset); }
 169
 170     ALWAYS_INLINE void setCodeStart(const StringImpl*);
 171
 172     ALWAYS_INLINE const Identifier* makeIdentifier(const LChar* characters, size_t length);
 173     ALWAYS_INLINE const Identifier* makeIdentifier(const UChar* characters, size_t length);
 174     ALWAYS_INLINE const Identifier* makeLCharIdentifier(const LChar* characters, size_t length);
 175     ALWAYS_INLINE const Identifier* makeLCharIdentifier(const UChar* characters, size_t length);
 176     ALWAYS_INLINE const Identifier* makeRightSizedIdentifier(const UChar* characters, size_t length, UChar orAllChars);
 177     ALWAYS_INLINE const Identifier* makeIdentifierLCharFromUChar(const UChar* characters, size_t length);
 178     ALWAYS_INLINE const Identifier* makeEmptyIdentifier();
 179
 180     ALWAYS_INLINE bool lastTokenWasRestrKeyword() const;
 181
 182     template <int shiftAmount> void internalShift();
 183     template <bool shouldCreateIdentifier> ALWAYS_INLINE JSTokenType parseKeyword(JSTokenData*);
 184     template <bool shouldBuildIdentifiers> ALWAYS_INLINE JSTokenType parseIdentifier(JSTokenData*, unsigned lexerFlags, bool strictMode);
 185     template <bool shouldBuildIdentifiers> NEVER_INLINE JSTokenType parseIdentifierSlowCase(JSTokenData*, unsigned lexerFlags, bool strictMode);
 186     enum StringParseResult {
 187         StringParsedSuccessfully,
 188         StringUnterminated,
 189         StringCannotBeParsed
 190     };
 191     template <bool shouldBuildStrings> ALWAYS_INLINE StringParseResult parseString(JSTokenData*, bool strictMode);
 192     template <bool shouldBuildStrings> NEVER_INLINE StringParseResult parseStringSlowCase(JSTokenData*, bool strictMode);
 193
 194     enum class EscapeParseMode { Template, String };
 195     template <bool shouldBuildStrings> ALWAYS_INLINE StringParseResult parseComplexEscape(EscapeParseMode, bool strictMode, T stringQuoteCharacter);
 196 #if ENABLE(ES6_TEMPLATE_LITERAL_SYNTAX)
 197     template <bool shouldBuildStrings> ALWAYS_INLINE StringParseResult parseTemplateLiteral(JSTokenData*, RawStringsBuildMode);
 198 #endif
 199     ALWAYS_INLINE void parseHex(double& returnValue);
 200     ALWAYS_INLINE bool parseBinary(double& returnValue);
 201     ALWAYS_INLINE bool parseOctal(double& returnValue);
 202     ALWAYS_INLINE bool parseDecimal(double& returnValue);
 203     ALWAYS_INLINE void parseNumberAfterDecimalPoint();
 204     ALWAYS_INLINE bool parseNumberAfterExponentIndicator();
 205     ALWAYS_INLINE bool parseMultilineComment();
 206
 207     static const size_t initialReadBufferCapacity = 32;
 208
 209     int m_lineNumber;
 210     int m_lastLineNumber;
 211
 212     Vector<LChar> m_buffer8;
 213     Vector<UChar> m_buffer16;
 214     Vector<UChar> m_bufferForRawTemplateString16;
 215     bool m_terminator;
 216     int m_lastToken;
 217
 218     const SourceCode* m_source;
 219     unsigned m_sourceOffset;
 220     const T* m_code;
 221     const T* m_codeStart;
 222     const T* m_codeEnd;
 223     const T* m_codeStartPlusOffset;
 224     const T* m_lineStart;
 225     JSTextPosition m_positionBeforeLastNewline;
 226     JSTokenLocation m_lastTockenLocation;
 227     bool m_isReparsing;
 228     bool m_atLineStart;
 229     bool m_error;
 230     String m_lexErrorMessage;
 231
 232     T m_current;
 233
 234     IdentifierArena* m_arena;
 235
 236     VM* m_vm;
 237     bool m_parsingBuiltinFunction;
 238 };
 239
 240 template <>
 241 ALWAYS_INLINE bool Lexer<LChar>::isWhiteSpace(LChar ch)
 242 {
 243     return ch == ' ' || ch == '\t' || ch == 0xB || ch == 0xC || ch == 0xA0;
 244 }
 245
 246 template <>
 247 ALWAYS_INLINE bool Lexer<UChar>::isWhiteSpace(UChar ch)
 248 {
 249     // 0x180E used to be in Zs category before Unicode 6.3, and EcmaScript says that we should keep treating it as such.
 250     return (ch < 256) ? Lexer<LChar>::isWhiteSpace(static_cast<LChar>(ch)) : (u_charType(ch) == U_SPACE_SEPARATOR || ch == 0x180E || ch == 0xFEFF);
 251 }
 252
 253 template <>
 254 ALWAYS_INLINE bool Lexer<LChar>::isLineTerminator(LChar ch)
 255 {
 256     return ch == '\r' || ch == '\n';
 257 }
 258
 259 template <>
 260 ALWAYS_INLINE bool Lexer<UChar>::isLineTerminator(UChar ch)
 261 {
 262     return ch == '\r' || ch == '\n' || (ch & ~1) == 0x2028;
 263 }
 264
 265 template <typename T>
 266 inline unsigned char Lexer<T>::convertHex(int c1, int c2)
 267 {
 268     return (toASCIIHexValue(c1) << 4) | toASCIIHexValue(c2);
 269 }
 270
 271 template <typename T>
 272 inline UChar Lexer<T>::convertUnicode(int c1, int c2, int c3, int c4)
 273 {
 274     return (convertHex(c1, c2) << 8) | convertHex(c3, c4);
 275 }
 276
 277 template <typename T>
 278 ALWAYS_INLINE const Identifier* Lexer<T>::makeIdentifier(const LChar* characters, size_t length)
 279 {
 280     return &m_arena->makeIdentifier(m_vm, characters, length);
 281 }
 282
 283 template <typename T>
 284 ALWAYS_INLINE const Identifier* Lexer<T>::makeIdentifier(const UChar* characters, size_t length)
 285 {
 286     return &m_arena->makeIdentifier(m_vm, characters, length);
 287 }
 288
 289 template <>
 290 ALWAYS_INLINE const Identifier* Lexer<LChar>::makeRightSizedIdentifier(const UChar* characters, size_t length, UChar)
 291 {
 292     return &m_arena->makeIdentifierLCharFromUChar(m_vm, characters, length);
 293 }
 294
 295 template <>
 296 ALWAYS_INLINE const Identifier* Lexer<UChar>::makeRightSizedIdentifier(const UChar* characters, size_t length, UChar orAllChars)
 297 {
 298     if (!(orAllChars & ~0xff))
 299         return &m_arena->makeIdentifierLCharFromUChar(m_vm, characters, length);
 300
 301     return &m_arena->makeIdentifier(m_vm, characters, length);
 302 }
 303
 304 template <typename T>
 305 ALWAYS_INLINE const Identifier* Lexer<T>::makeEmptyIdentifier()
 306 {
 307     return &m_arena->makeEmptyIdentifier(m_vm);
 308 }
 309
 310 template <>
 311 ALWAYS_INLINE void Lexer<LChar>::setCodeStart(const StringImpl* sourceString)
 312 {
 313     ASSERT(sourceString->is8Bit());
 314     m_codeStart = sourceString->characters8();
 315 }
 316
 317 template <>
 318 ALWAYS_INLINE void Lexer<UChar>::setCodeStart(const StringImpl* sourceString)
 319 {
 320     ASSERT(!sourceString->is8Bit());
 321     m_codeStart = sourceString->characters16();
 322 }
 323
 324 template <typename T>
 325 ALWAYS_INLINE const Identifier* Lexer<T>::makeIdentifierLCharFromUChar(const UChar* characters, size_t length)
 326 {
 327     return &m_arena->makeIdentifierLCharFromUChar(m_vm, characters, length);
 328 }
 329
 330 template <typename T>
 331 ALWAYS_INLINE const Identifier* Lexer<T>::makeLCharIdentifier(const LChar* characters, size_t length)
 332 {
 333     return &m_arena->makeIdentifier(m_vm, characters, length);
 334 }
 335
 336 template <typename T>
 337 ALWAYS_INLINE const Identifier* Lexer<T>::makeLCharIdentifier(const UChar* characters, size_t length)
 338 {
 339     return &m_arena->makeIdentifierLCharFromUChar(m_vm, characters, length);
 340 }
 341
 342 #if ASSERT_DISABLED
 343 ALWAYS_INLINE bool isSafeBuiltinIdentifier(VM&, const Identifier*) { return true; }
 344 #else
 345 bool isSafeBuiltinIdentifier(VM&, const Identifier*);
 346 #endif
 347
 348 template <typename T>
 349 ALWAYS_INLINE JSTokenType Lexer<T>::lexExpectIdentifier(JSToken* tokenRecord, unsigned lexerFlags, bool strictMode)
 350 {
 351     JSTokenData* tokenData = &tokenRecord->m_data;
 352     JSTokenLocation* tokenLocation = &tokenRecord->m_location;
 353     ASSERT((lexerFlags & LexerFlagsIgnoreReservedWords));
 354     const T* start = m_code;
 355     const T* ptr = start;
 356     const T* end = m_codeEnd;
 357     JSTextPosition startPosition = currentPosition();
 358     if (ptr >= end) {
 359         ASSERT(ptr == end);
 360         goto slowCase;
 361     }
 362     if (!WTF::isASCIIAlpha(*ptr))
 363         goto slowCase;
 364     ++ptr;
 365     while (ptr < end) {
 366         if (!WTF::isASCIIAlphanumeric(*ptr))
 367             break;
 368         ++ptr;
 369     }
 370
 371     // Here's the shift
 372     if (ptr < end) {
 373         if ((!WTF::isASCII(*ptr)) || (*ptr == '\\') || (*ptr == '_') || (*ptr == '$'))
 374             goto slowCase;
 375         m_current = *ptr;
 376     } else
 377         m_current = 0;
 378
 379     m_code = ptr;
 380     ASSERT(currentOffset() >= currentLineStartOffset());
 381
 382     // Create the identifier if needed
 383     if (lexerFlags & LexexFlagsDontBuildKeywords
 384 #if !ASSERT_DISABLED
 385         && !m_parsingBuiltinFunction
 386 #endif
 387         )
 388         tokenData->ident = 0;
 389     else
 390         tokenData->ident = makeLCharIdentifier(start, ptr - start);
 391
 392     tokenLocation->line = m_lineNumber;
 393     tokenLocation->lineStartOffset = currentLineStartOffset();
 394     tokenLocation->startOffset = offsetFromSourcePtr(start);
 395     tokenLocation->endOffset = currentOffset();
 396     ASSERT(tokenLocation->startOffset >= tokenLocation->lineStartOffset);
 397     tokenRecord->m_startPosition = startPosition;
 398     tokenRecord->m_endPosition = currentPosition();
 399 #if !ASSERT_DISABLED
 400     if (m_parsingBuiltinFunction) {
 401         if (!isSafeBuiltinIdentifier(*m_vm, tokenData->ident))
 402             return ERRORTOK;
 403     }
 404 #endif
 405
 406     m_lastToken = IDENT;
 407     return IDENT;
 408
 409 slowCase:
 410     return lex(tokenRecord, lexerFlags, strictMode);
 411 }
 412
 413 } // namespace JSC
 414
 415 #endif // Lexer_h