parser/Lexer.h

   1 /*
   2  *  Copyright (C) 1999-2000 Harri Porten (porten@kde.org)
   3  *  Copyright (C) 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2011, 2012 Apple Inc. All rights reserved.
   4  *  Copyright (C) 2010 Zoltan Herczeg (zherczeg@inf.u-szeged.hu)
   5  *
   6  *  This library is free software; you can redistribute it and/or
   7  *  modify it under the terms of the GNU Library General Public
   8  *  License as published by the Free Software Foundation; either
   9  *  version 2 of the License, or (at your option) any later version.
  10  *
  11  *  This library is distributed in the hope that it will be useful,
  12  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
  13  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14  *  Library General Public License for more details.
  15  *
  16  *  You should have received a copy of the GNU Library General Public License
  17  *  along with this library; see the file COPYING.LIB.  If not, write to
  18  *  the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
  19  *  Boston, MA 02110-1301, USA.
  20  *
  21  */
  22
  23 #ifndef Lexer_h
  24 #define Lexer_h
  25
  26 #include "Lookup.h"
  27 #include "ParserArena.h"
  28 #include "ParserTokens.h"
  29 #include "SourceCode.h"
  30 #include <wtf/ASCIICType.h>
  31 #include <wtf/AlwaysInline.h>
  32 #include <wtf/SegmentedVector.h>
  33 #include <wtf/Vector.h>
  34 #include <wtf/unicode/Unicode.h>
  35
  36 namespace JSC {
  37
  38 class Keywords {
  39 public:
  40     bool isKeyword(const Identifier& ident) const
  41     {
  42         return m_keywordTable.entry(m_globalData, ident);
  43     }
  44
  45     const HashEntry* getKeyword(const Identifier& ident) const
  46     {
  47         return m_keywordTable.entry(m_globalData, ident);
  48     }
  49
  50     ~Keywords()
  51     {
  52         m_keywordTable.deleteTable();
  53     }
  54
  55 private:
  56     friend class JSGlobalData;
  57
  58     Keywords(JSGlobalData*);
  59
  60     JSGlobalData* m_globalData;
  61     const HashTable m_keywordTable;
  62 };
  63
  64 enum LexerFlags {
  65     LexerFlagsIgnoreReservedWords = 1,
  66     LexerFlagsDontBuildStrings = 2,
  67     LexexFlagsDontBuildKeywords = 4
  68 };
  69
  70 template <typename T>
  71 class Lexer {
  72     WTF_MAKE_NONCOPYABLE(Lexer);
  73     WTF_MAKE_FAST_ALLOCATED;
  74
  75 public:
  76     Lexer(JSGlobalData*);
  77     ~Lexer();
  78
  79     // Character manipulation functions.
  80     static bool isWhiteSpace(T character);
  81     static bool isLineTerminator(T character);
  82     static unsigned char convertHex(int c1, int c2);
  83     static UChar convertUnicode(int c1, int c2, int c3, int c4);
  84
  85     // Functions to set up parsing.
  86     void setCode(const SourceCode&, ParserArena*);
  87     void setIsReparsing() { m_isReparsing = true; }
  88     bool isReparsing() const { return m_isReparsing; }
  89
  90     JSTokenType lex(JSTokenData*, JSTokenInfo*, unsigned, bool strictMode);
  91     bool nextTokenIsColon();
  92     int lineNumber() const { return m_lineNumber; }
  93     void setLastLineNumber(int lastLineNumber) { m_lastLineNumber = lastLineNumber; }
  94     int lastLineNumber() const { return m_lastLineNumber; }
  95     bool prevTerminator() const { return m_terminator; }
  96     SourceCode sourceCode(int openBrace, int closeBrace, int firstLine);
  97     bool scanRegExp(const Identifier*& pattern, const Identifier*& flags, UChar patternPrefix = 0);
  98     bool skipRegExp();
  99
 100     // Functions for use after parsing.
 101     bool sawError() const { return m_error; }
 102     UString getErrorMessage() const { return m_lexErrorMessage; }
 103     void clear();
 104     void setOffset(int offset)
 105     {
 106         m_error = 0;
 107         m_lexErrorMessage = UString();
 108         m_code = m_codeStart + offset;
 109         m_buffer8.resize(0);
 110         m_buffer16.resize(0);
 111         if (LIKELY(m_code < m_codeEnd))
 112             m_current = *m_code;
 113         else
 114             m_current = 0;
 115     }
 116     void setLineNumber(int line)
 117     {
 118         m_lineNumber = line;
 119     }
 120
 121     SourceProvider* sourceProvider() const { return m_source->provider(); }
 122
 123     JSTokenType lexExpectIdentifier(JSTokenData*, JSTokenInfo*, unsigned, bool strictMode);
 124
 125 private:
 126     void record8(int);
 127     void append8(const T*, size_t);
 128     void record16(int);
 129     void record16(T);
 130     void append16(const LChar*, size_t);
 131     void append16(const UChar* characters, size_t length) { m_buffer16.append(characters, length); }
 132
 133     ALWAYS_INLINE void shift();
 134     ALWAYS_INLINE bool atEnd() const;
 135     ALWAYS_INLINE T peek(int offset) const;
 136     int parseFourDigitUnicodeHex();
 137     void shiftLineTerminator();
 138
 139     UString invalidCharacterMessage() const;
 140     ALWAYS_INLINE const T* currentCharacter() const;
 141     ALWAYS_INLINE int currentOffset() const { return m_code - m_codeStart; }
 142     ALWAYS_INLINE void setOffsetFromCharOffset(const T* charOffset) { setOffset(charOffset - m_codeStart); }
 143
 144     ALWAYS_INLINE void setCodeStart(const StringImpl*);
 145
 146     ALWAYS_INLINE const Identifier* makeIdentifier(const LChar* characters, size_t length);
 147     ALWAYS_INLINE const Identifier* makeIdentifier(const UChar* characters, size_t length);
 148     ALWAYS_INLINE const Identifier* makeIdentifierLCharFromUChar(const UChar* characters, size_t length);
 149
 150     ALWAYS_INLINE bool lastTokenWasRestrKeyword() const;
 151
 152     template <int shiftAmount> void internalShift();
 153     template <bool shouldCreateIdentifier> ALWAYS_INLINE JSTokenType parseKeyword(JSTokenData*);
 154     template <bool shouldBuildIdentifiers> ALWAYS_INLINE JSTokenType parseIdentifier(JSTokenData*, unsigned lexerFlags, bool strictMode);
 155     template <bool shouldBuildIdentifiers> NEVER_INLINE JSTokenType parseIdentifierSlowCase(JSTokenData*, unsigned lexerFlags, bool strictMode);
 156     template <bool shouldBuildStrings> ALWAYS_INLINE bool parseString(JSTokenData*, bool strictMode);
 157     template <bool shouldBuildStrings> NEVER_INLINE bool parseStringSlowCase(JSTokenData*, bool strictMode);
 158     ALWAYS_INLINE void parseHex(double& returnValue);
 159     ALWAYS_INLINE bool parseOctal(double& returnValue);
 160     ALWAYS_INLINE bool parseDecimal(double& returnValue);
 161     ALWAYS_INLINE void parseNumberAfterDecimalPoint();
 162     ALWAYS_INLINE bool parseNumberAfterExponentIndicator();
 163     ALWAYS_INLINE bool parseMultilineComment();
 164
 165     static const size_t initialReadBufferCapacity = 32;
 166
 167     int m_lineNumber;
 168     int m_lastLineNumber;
 169
 170     Vector<LChar> m_buffer8;
 171     Vector<UChar> m_buffer16;
 172     bool m_terminator;
 173     int m_lastToken;
 174
 175     const SourceCode* m_source;
 176     const T* m_code;
 177     const T* m_codeStart;
 178     const T* m_codeEnd;
 179     bool m_isReparsing;
 180     bool m_atLineStart;
 181     bool m_error;
 182     UString m_lexErrorMessage;
 183
 184     T m_current;
 185
 186     IdentifierArena* m_arena;
 187
 188     JSGlobalData* m_globalData;
 189 };
 190
 191 template <>
 192 ALWAYS_INLINE bool Lexer<LChar>::isWhiteSpace(LChar ch)
 193 {
 194     return ch == ' ' || ch == '\t' || ch == 0xB || ch == 0xC || ch == 0xA0;
 195 }
 196
 197 template <>
 198 ALWAYS_INLINE bool Lexer<UChar>::isWhiteSpace(UChar ch)
 199 {
 200     return (ch < 256) ? Lexer<LChar>::isWhiteSpace(static_cast<LChar>(ch)) : (WTF::Unicode::isSeparatorSpace(ch) || ch == 0xFEFF);
 201 }
 202
 203 template <>
 204 ALWAYS_INLINE bool Lexer<LChar>::isLineTerminator(LChar ch)
 205 {
 206     return ch == '\r' || ch == '\n';
 207 }
 208
 209 template <>
 210 ALWAYS_INLINE bool Lexer<UChar>::isLineTerminator(UChar ch)
 211 {
 212     return ch == '\r' || ch == '\n' || (ch & ~1) == 0x2028;
 213 }
 214
 215 template <typename T>
 216 inline unsigned char Lexer<T>::convertHex(int c1, int c2)
 217 {
 218     return (toASCIIHexValue(c1) << 4) | toASCIIHexValue(c2);
 219 }
 220
 221 template <typename T>
 222 inline UChar Lexer<T>::convertUnicode(int c1, int c2, int c3, int c4)
 223 {
 224     return (convertHex(c1, c2) << 8) | convertHex(c3, c4);
 225 }
 226
 227 template <typename T>
 228 ALWAYS_INLINE const Identifier* Lexer<T>::makeIdentifier(const LChar* characters, size_t length)
 229 {
 230     return &m_arena->makeIdentifier(m_globalData, characters, length);
 231 }
 232
 233 template <typename T>
 234 ALWAYS_INLINE const Identifier* Lexer<T>::makeIdentifier(const UChar* characters, size_t length)
 235 {
 236     return &m_arena->makeIdentifier(m_globalData, characters, length);
 237 }
 238
 239 template <>
 240 ALWAYS_INLINE void Lexer<LChar>::setCodeStart(const StringImpl* sourceString)
 241 {
 242     ASSERT(sourceString->is8Bit());
 243     m_codeStart = sourceString->characters8();
 244 }
 245
 246 template <>
 247 ALWAYS_INLINE void Lexer<UChar>::setCodeStart(const StringImpl* sourceString)
 248 {
 249     ASSERT(!sourceString->is8Bit());
 250     m_codeStart = sourceString->characters16();
 251 }
 252
 253 template <typename T>
 254 ALWAYS_INLINE const Identifier* Lexer<T>::makeIdentifierLCharFromUChar(const UChar* characters, size_t length)
 255 {
 256     return &m_arena->makeIdentifierLCharFromUChar(m_globalData, characters, length);
 257 }
 258
 259 template <typename T>
 260 ALWAYS_INLINE JSTokenType Lexer<T>::lexExpectIdentifier(JSTokenData* tokenData, JSTokenInfo* tokenInfo, unsigned lexerFlags, bool strictMode)
 261 {
 262     ASSERT((lexerFlags & LexerFlagsIgnoreReservedWords));
 263     const T* start = m_code;
 264     const T* ptr = start;
 265     const T* end = m_codeEnd;
 266     if (ptr >= end) {
 267         ASSERT(ptr == end);
 268         goto slowCase;
 269     }
 270     if (!WTF::isASCIIAlpha(*ptr))
 271         goto slowCase;
 272     ++ptr;
 273     while (ptr < end) {
 274         if (!WTF::isASCIIAlphanumeric(*ptr))
 275             break;
 276         ++ptr;
 277     }
 278
 279     // Here's the shift
 280     if (ptr < end) {
 281         if ((!WTF::isASCII(*ptr)) || (*ptr == '\\') || (*ptr == '_') || (*ptr == '$'))
 282             goto slowCase;
 283         m_current = *ptr;
 284     } else
 285         m_current = 0;
 286
 287     m_code = ptr;
 288
 289     // Create the identifier if needed
 290     if (lexerFlags & LexexFlagsDontBuildKeywords)
 291         tokenData->ident = 0;
 292     else
 293         tokenData->ident = makeIdentifier(start, ptr - start);
 294     tokenInfo->line = m_lineNumber;
 295     tokenInfo->startOffset = start - m_codeStart;
 296     tokenInfo->endOffset = currentOffset();
 297     m_lastToken = IDENT;
 298     return IDENT;
 299
 300 slowCase:
 301     return lex(tokenData, tokenInfo, lexerFlags, strictMode);
 302 }
 303
 304 } // namespace JSC
 305
 306 #endif // Lexer_h