parser/Lexer.h

   1 /*
   2  *  Copyright (C) 1999-2000 Harri Porten (porten@kde.org)
   3  *  Copyright (C) 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2011, 2012, 2013 Apple Inc. All rights reserved.
   4  *  Copyright (C) 2010 Zoltan Herczeg (zherczeg@inf.u-szeged.hu)
   5  *
   6  *  This library is free software; you can redistribute it and/or
   7  *  modify it under the terms of the GNU Library General Public
   8  *  License as published by the Free Software Foundation; either
   9  *  version 2 of the License, or (at your option) any later version.
  10  *
  11  *  This library is distributed in the hope that it will be useful,
  12  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
  13  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14  *  Library General Public License for more details.
  15  *
  16  *  You should have received a copy of the GNU Library General Public License
  17  *  along with this library; see the file COPYING.LIB.  If not, write to
  18  *  the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
  19  *  Boston, MA 02110-1301, USA.
  20  *
  21  */
  22
  23 #ifndef Lexer_h
  24 #define Lexer_h
  25
  26 #include "Lookup.h"
  27 #include "ParserArena.h"
  28 #include "ParserTokens.h"
  29 #include "SourceCode.h"
  30 #include <wtf/ASCIICType.h>
  31 #include <wtf/SegmentedVector.h>
  32 #include <wtf/Vector.h>
  33 #include <wtf/unicode/Unicode.h>
  34
  35 namespace JSC {
  36
  37 class Keywords {
  38 public:
  39     bool isKeyword(const Identifier& ident) const
  40     {
  41         return m_keywordTable.entry(m_vm, ident);
  42     }
  43
  44     const HashEntry* getKeyword(const Identifier& ident) const
  45     {
  46         return m_keywordTable.entry(m_vm, ident);
  47     }
  48
  49     ~Keywords()
  50     {
  51         m_keywordTable.deleteTable();
  52     }
  53
  54 private:
  55     friend class VM;
  56
  57     Keywords(VM*);
  58
  59     VM* m_vm;
  60     const HashTable m_keywordTable;
  61 };
  62
  63 enum LexerFlags {
  64     LexerFlagsIgnoreReservedWords = 1,
  65     LexerFlagsDontBuildStrings = 2,
  66     LexexFlagsDontBuildKeywords = 4
  67 };
  68
  69 template <typename T>
  70 class Lexer {
  71     WTF_MAKE_NONCOPYABLE(Lexer);
  72     WTF_MAKE_FAST_ALLOCATED;
  73
  74 public:
  75     Lexer(VM*);
  76     ~Lexer();
  77
  78     // Character manipulation functions.
  79     static bool isWhiteSpace(T character);
  80     static bool isLineTerminator(T character);
  81     static unsigned char convertHex(int c1, int c2);
  82     static UChar convertUnicode(int c1, int c2, int c3, int c4);
  83
  84     // Functions to set up parsing.
  85     void setCode(const SourceCode&, ParserArena*);
  86     void setIsReparsing() { m_isReparsing = true; }
  87     bool isReparsing() const { return m_isReparsing; }
  88
  89     JSTokenType lex(JSTokenData*, JSTokenLocation*, unsigned, bool strictMode);
  90     bool nextTokenIsColon();
  91     int lineNumber() const { return m_lineNumber; }
  92     ALWAYS_INLINE int currentOffset() const { return offsetFromSourcePtr(m_code); }
  93     ALWAYS_INLINE int currentLineStartOffset() const { return offsetFromSourcePtr(m_lineStart); }
  94     void setLastLineNumber(int lastLineNumber) { m_lastLineNumber = lastLineNumber; }
  95     int lastLineNumber() const { return m_lastLineNumber; }
  96     bool prevTerminator() const { return m_terminator; }
  97     SourceCode sourceCode(int openBrace, int closeBrace, int firstLine, unsigned startColumn);
  98     bool scanRegExp(const Identifier*& pattern, const Identifier*& flags, UChar patternPrefix = 0);
  99     bool skipRegExp();
 100
 101     // Functions for use after parsing.
 102     bool sawError() const { return m_error; }
 103     String getErrorMessage() const { return m_lexErrorMessage; }
 104     void clear();
 105     void setOffset(int offset, int lineStartOffset)
 106     {
 107         m_error = 0;
 108         m_lexErrorMessage = String();
 109
 110         m_code = sourcePtrFromOffset(offset);
 111         m_lineStart = sourcePtrFromOffset(lineStartOffset);
 112         ASSERT(currentOffset() >= currentLineStartOffset());
 113
 114         m_buffer8.resize(0);
 115         m_buffer16.resize(0);
 116         if (LIKELY(m_code < m_codeEnd))
 117             m_current = *m_code;
 118         else
 119             m_current = 0;
 120     }
 121     void setLineNumber(int line)
 122     {
 123         m_lineNumber = line;
 124     }
 125
 126     SourceProvider* sourceProvider() const { return m_source->provider(); }
 127
 128     JSTokenType lexExpectIdentifier(JSTokenData*, JSTokenLocation*, unsigned, bool strictMode);
 129
 130 private:
 131     void record8(int);
 132     void append8(const T*, size_t);
 133     void record16(int);
 134     void record16(T);
 135     void append16(const LChar*, size_t);
 136     void append16(const UChar* characters, size_t length) { m_buffer16.append(characters, length); }
 137
 138     ALWAYS_INLINE void shift();
 139     ALWAYS_INLINE bool atEnd() const;
 140     ALWAYS_INLINE T peek(int offset) const;
 141     struct UnicodeHexValue {
 142
 143         enum ValueType { ValidHex, IncompleteHex, InvalidHex };
 144
 145         explicit UnicodeHexValue(int value)
 146             : m_value(value)
 147         {
 148         }
 149         explicit UnicodeHexValue(ValueType type)
 150             : m_value(type == IncompleteHex ? -2 : -1)
 151         {
 152         }
 153
 154         ValueType valueType() const
 155         {
 156             if (m_value >= 0)
 157                 return ValidHex;
 158             return m_value == -2 ? IncompleteHex : InvalidHex;
 159         }
 160         bool isValid() const { return m_value >= 0; }
 161         int value() const
 162         {
 163             ASSERT(m_value >= 0);
 164             return m_value;
 165         }
 166
 167     private:
 168         int m_value;
 169     };
 170     UnicodeHexValue parseFourDigitUnicodeHex();
 171     void shiftLineTerminator();
 172
 173     ALWAYS_INLINE int offsetFromSourcePtr(const T* ptr) const { return ptr - m_codeStart; }
 174     ALWAYS_INLINE const T* sourcePtrFromOffset(int offset) const { return m_codeStart + offset; }
 175
 176     String invalidCharacterMessage() const;
 177     ALWAYS_INLINE const T* currentSourcePtr() const;
 178     ALWAYS_INLINE void setOffsetFromSourcePtr(const T* sourcePtr, unsigned lineStartOffset) { setOffset(offsetFromSourcePtr(sourcePtr), lineStartOffset); }
 179
 180     ALWAYS_INLINE void setCodeStart(const StringImpl*);
 181
 182     ALWAYS_INLINE const Identifier* makeIdentifier(const LChar* characters, size_t length);
 183     ALWAYS_INLINE const Identifier* makeIdentifier(const UChar* characters, size_t length);
 184     ALWAYS_INLINE const Identifier* makeLCharIdentifier(const LChar* characters, size_t length);
 185     ALWAYS_INLINE const Identifier* makeLCharIdentifier(const UChar* characters, size_t length);
 186     ALWAYS_INLINE const Identifier* makeRightSizedIdentifier(const UChar* characters, size_t length, UChar orAllChars);
 187     ALWAYS_INLINE const Identifier* makeIdentifierLCharFromUChar(const UChar* characters, size_t length);
 188
 189     ALWAYS_INLINE bool lastTokenWasRestrKeyword() const;
 190
 191     template <int shiftAmount> void internalShift();
 192     template <bool shouldCreateIdentifier> ALWAYS_INLINE JSTokenType parseKeyword(JSTokenData*);
 193     template <bool shouldBuildIdentifiers> ALWAYS_INLINE JSTokenType parseIdentifier(JSTokenData*, unsigned lexerFlags, bool strictMode);
 194     template <bool shouldBuildIdentifiers> NEVER_INLINE JSTokenType parseIdentifierSlowCase(JSTokenData*, unsigned lexerFlags, bool strictMode);
 195     enum StringParseResult {
 196         StringParsedSuccessfully,
 197         StringUnterminated,
 198         StringCannotBeParsed
 199     };
 200     template <bool shouldBuildStrings> ALWAYS_INLINE StringParseResult parseString(JSTokenData*, bool strictMode);
 201     template <bool shouldBuildStrings> NEVER_INLINE StringParseResult parseStringSlowCase(JSTokenData*, bool strictMode);
 202     ALWAYS_INLINE void parseHex(double& returnValue);
 203     ALWAYS_INLINE bool parseOctal(double& returnValue);
 204     ALWAYS_INLINE bool parseDecimal(double& returnValue);
 205     ALWAYS_INLINE void parseNumberAfterDecimalPoint();
 206     ALWAYS_INLINE bool parseNumberAfterExponentIndicator();
 207     ALWAYS_INLINE bool parseMultilineComment();
 208
 209     static const size_t initialReadBufferCapacity = 32;
 210
 211     int m_lineNumber;
 212     int m_lastLineNumber;
 213
 214     Vector<LChar> m_buffer8;
 215     Vector<UChar> m_buffer16;
 216     bool m_terminator;
 217     int m_lastToken;
 218
 219     const SourceCode* m_source;
 220     unsigned m_sourceOffset;
 221     const T* m_code;
 222     const T* m_codeStart;
 223     const T* m_codeEnd;
 224     const T* m_codeStartPlusOffset;
 225     const T* m_lineStart;
 226     bool m_isReparsing;
 227     bool m_atLineStart;
 228     bool m_error;
 229     String m_lexErrorMessage;
 230
 231     T m_current;
 232
 233     IdentifierArena* m_arena;
 234
 235     VM* m_vm;
 236 };
 237
 238 template <>
 239 ALWAYS_INLINE bool Lexer<LChar>::isWhiteSpace(LChar ch)
 240 {
 241     return ch == ' ' || ch == '\t' || ch == 0xB || ch == 0xC || ch == 0xA0;
 242 }
 243
 244 template <>
 245 ALWAYS_INLINE bool Lexer<UChar>::isWhiteSpace(UChar ch)
 246 {
 247     return (ch < 256) ? Lexer<LChar>::isWhiteSpace(static_cast<LChar>(ch)) : (WTF::Unicode::isSeparatorSpace(ch) || ch == 0xFEFF);
 248 }
 249
 250 template <>
 251 ALWAYS_INLINE bool Lexer<LChar>::isLineTerminator(LChar ch)
 252 {
 253     return ch == '\r' || ch == '\n';
 254 }
 255
 256 template <>
 257 ALWAYS_INLINE bool Lexer<UChar>::isLineTerminator(UChar ch)
 258 {
 259     return ch == '\r' || ch == '\n' || (ch & ~1) == 0x2028;
 260 }
 261
 262 template <typename T>
 263 inline unsigned char Lexer<T>::convertHex(int c1, int c2)
 264 {
 265     return (toASCIIHexValue(c1) << 4) | toASCIIHexValue(c2);
 266 }
 267
 268 template <typename T>
 269 inline UChar Lexer<T>::convertUnicode(int c1, int c2, int c3, int c4)
 270 {
 271     return (convertHex(c1, c2) << 8) | convertHex(c3, c4);
 272 }
 273
 274 template <typename T>
 275 ALWAYS_INLINE const Identifier* Lexer<T>::makeIdentifier(const LChar* characters, size_t length)
 276 {
 277     return &m_arena->makeIdentifier(m_vm, characters, length);
 278 }
 279
 280 template <typename T>
 281 ALWAYS_INLINE const Identifier* Lexer<T>::makeIdentifier(const UChar* characters, size_t length)
 282 {
 283     return &m_arena->makeIdentifier(m_vm, characters, length);
 284 }
 285
 286 template <>
 287 ALWAYS_INLINE const Identifier* Lexer<LChar>::makeRightSizedIdentifier(const UChar* characters, size_t length, UChar)
 288 {
 289     return &m_arena->makeIdentifierLCharFromUChar(m_vm, characters, length);
 290 }
 291
 292 template <>
 293 ALWAYS_INLINE const Identifier* Lexer<UChar>::makeRightSizedIdentifier(const UChar* characters, size_t length, UChar orAllChars)
 294 {
 295     if (!(orAllChars & ~0xff))
 296         return &m_arena->makeIdentifierLCharFromUChar(m_vm, characters, length);
 297
 298     return &m_arena->makeIdentifier(m_vm, characters, length);
 299 }
 300
 301 template <>
 302 ALWAYS_INLINE void Lexer<LChar>::setCodeStart(const StringImpl* sourceString)
 303 {
 304     ASSERT(sourceString->is8Bit());
 305     m_codeStart = sourceString->characters8();
 306 }
 307
 308 template <>
 309 ALWAYS_INLINE void Lexer<UChar>::setCodeStart(const StringImpl* sourceString)
 310 {
 311     ASSERT(!sourceString->is8Bit());
 312     m_codeStart = sourceString->characters16();
 313 }
 314
 315 template <typename T>
 316 ALWAYS_INLINE const Identifier* Lexer<T>::makeIdentifierLCharFromUChar(const UChar* characters, size_t length)
 317 {
 318     return &m_arena->makeIdentifierLCharFromUChar(m_vm, characters, length);
 319 }
 320
 321 template <typename T>
 322 ALWAYS_INLINE const Identifier* Lexer<T>::makeLCharIdentifier(const LChar* characters, size_t length)
 323 {
 324     return &m_arena->makeIdentifier(m_vm, characters, length);
 325 }
 326
 327 template <typename T>
 328 ALWAYS_INLINE const Identifier* Lexer<T>::makeLCharIdentifier(const UChar* characters, size_t length)
 329 {
 330     return &m_arena->makeIdentifierLCharFromUChar(m_vm, characters, length);
 331 }
 332
 333 template <typename T>
 334 ALWAYS_INLINE JSTokenType Lexer<T>::lexExpectIdentifier(JSTokenData* tokenData, JSTokenLocation* tokenLocation, unsigned lexerFlags, bool strictMode)
 335 {
 336     ASSERT((lexerFlags & LexerFlagsIgnoreReservedWords));
 337     const T* start = m_code;
 338     const T* ptr = start;
 339     const T* end = m_codeEnd;
 340     if (ptr >= end) {
 341         ASSERT(ptr == end);
 342         goto slowCase;
 343     }
 344     if (!WTF::isASCIIAlpha(*ptr))
 345         goto slowCase;
 346     ++ptr;
 347     while (ptr < end) {
 348         if (!WTF::isASCIIAlphanumeric(*ptr))
 349             break;
 350         ++ptr;
 351     }
 352
 353     // Here's the shift
 354     if (ptr < end) {
 355         if ((!WTF::isASCII(*ptr)) || (*ptr == '\\') || (*ptr == '_') || (*ptr == '$'))
 356             goto slowCase;
 357         m_current = *ptr;
 358     } else
 359         m_current = 0;
 360
 361     m_code = ptr;
 362     ASSERT(currentOffset() >= currentLineStartOffset());
 363
 364     // Create the identifier if needed
 365     if (lexerFlags & LexexFlagsDontBuildKeywords)
 366         tokenData->ident = 0;
 367     else
 368         tokenData->ident = makeLCharIdentifier(start, ptr - start);
 369     tokenLocation->line = m_lineNumber;
 370     tokenLocation->lineStartOffset = currentLineStartOffset();
 371     tokenLocation->startOffset = offsetFromSourcePtr(start);
 372     tokenLocation->endOffset = currentOffset();
 373     ASSERT(tokenLocation->startOffset >= tokenLocation->lineStartOffset);
 374     m_lastToken = IDENT;
 375     return IDENT;
 376
 377 slowCase:
 378     return lex(tokenData, tokenLocation, lexerFlags, strictMode);
 379 }
 380
 381 } // namespace JSC
 382
 383 #endif // Lexer_h