parser/Lexer.h

   1 /*
   2  *  Copyright (C) 1999-2000 Harri Porten (porten@kde.org)
   3  *  Copyright (C) 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2011, 2012, 2013 Apple Inc. All rights reserved.
   4  *  Copyright (C) 2010 Zoltan Herczeg (zherczeg@inf.u-szeged.hu)
   5  *
   6  *  This library is free software; you can redistribute it and/or
   7  *  modify it under the terms of the GNU Library General Public
   8  *  License as published by the Free Software Foundation; either
   9  *  version 2 of the License, or (at your option) any later version.
  10  *
  11  *  This library is distributed in the hope that it will be useful,
  12  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
  13  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14  *  Library General Public License for more details.
  15  *
  16  *  You should have received a copy of the GNU Library General Public License
  17  *  along with this library; see the file COPYING.LIB.  If not, write to
  18  *  the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
  19  *  Boston, MA 02110-1301, USA.
  20  *
  21  */
  22
  23 #ifndef Lexer_h
  24 #define Lexer_h
  25
  26 #include "Lookup.h"
  27 #include "ParserArena.h"
  28 #include "ParserTokens.h"
  29 #include "SourceCode.h"
  30 #include <wtf/ASCIICType.h>
  31 #include <wtf/SegmentedVector.h>
  32 #include <wtf/Vector.h>
  33
  34 namespace JSC {
  35
  36 class Keywords {
  37 public:
  38     bool isKeyword(const Identifier& ident) const
  39     {
  40         return m_keywordTable.entry(m_vm, ident);
  41     }
  42
  43     const HashTableValue* getKeyword(const Identifier& ident) const
  44     {
  45         return m_keywordTable.entry(m_vm, ident);
  46     }
  47
  48     ~Keywords()
  49     {
  50         m_keywordTable.deleteTable();
  51     }
  52
  53 private:
  54     friend class VM;
  55
  56     explicit Keywords(VM&);
  57
  58     VM& m_vm;
  59     const HashTable m_keywordTable;
  60 };
  61
  62 enum LexerFlags {
  63     LexerFlagsIgnoreReservedWords = 1,
  64     LexerFlagsDontBuildStrings = 2,
  65     LexexFlagsDontBuildKeywords = 4
  66 };
  67
  68 template <typename T>
  69 class Lexer {
  70     WTF_MAKE_NONCOPYABLE(Lexer);
  71     WTF_MAKE_FAST_ALLOCATED;
  72
  73 public:
  74     Lexer(VM*, JSParserStrictness);
  75     ~Lexer();
  76
  77     // Character manipulation functions.
  78     static bool isWhiteSpace(T character);
  79     static bool isLineTerminator(T character);
  80     static unsigned char convertHex(int c1, int c2);
  81     static UChar convertUnicode(int c1, int c2, int c3, int c4);
  82
  83     // Functions to set up parsing.
  84     void setCode(const SourceCode&, ParserArena*);
  85     void setIsReparsing() { m_isReparsing = true; }
  86     bool isReparsing() const { return m_isReparsing; }
  87
  88     JSTokenType lex(JSToken*, unsigned, bool strictMode);
  89     bool nextTokenIsColon();
  90     int lineNumber() const { return m_lineNumber; }
  91     ALWAYS_INLINE int currentOffset() const { return offsetFromSourcePtr(m_code); }
  92     ALWAYS_INLINE int currentLineStartOffset() const { return offsetFromSourcePtr(m_lineStart); }
  93     ALWAYS_INLINE JSTextPosition currentPosition() const
  94     {
  95         return JSTextPosition(m_lineNumber, currentOffset(), currentLineStartOffset());
  96     }
  97     JSTextPosition positionBeforeLastNewline() const { return m_positionBeforeLastNewline; }
  98     void setLastLineNumber(int lastLineNumber) { m_lastLineNumber = lastLineNumber; }
  99     int lastLineNumber() const { return m_lastLineNumber; }
 100     bool prevTerminator() const { return m_terminator; }
 101     bool scanRegExp(const Identifier*& pattern, const Identifier*& flags, UChar patternPrefix = 0);
 102     bool skipRegExp();
 103
 104     // Functions for use after parsing.
 105     bool sawError() const { return m_error; }
 106     String getErrorMessage() const { return m_lexErrorMessage; }
 107     void clear();
 108     void setOffset(int offset, int lineStartOffset)
 109     {
 110         m_error = 0;
 111         m_lexErrorMessage = String();
 112
 113         m_code = sourcePtrFromOffset(offset);
 114         m_lineStart = sourcePtrFromOffset(lineStartOffset);
 115         ASSERT(currentOffset() >= currentLineStartOffset());
 116
 117         m_buffer8.resize(0);
 118         m_buffer16.resize(0);
 119         if (LIKELY(m_code < m_codeEnd))
 120             m_current = *m_code;
 121         else
 122             m_current = 0;
 123     }
 124     void setLineNumber(int line)
 125     {
 126         m_lineNumber = line;
 127     }
 128
 129     SourceProvider* sourceProvider() const { return m_source->provider(); }
 130
 131     JSTokenType lexExpectIdentifier(JSToken*, unsigned, bool strictMode);
 132
 133 private:
 134     void record8(int);
 135     void append8(const T*, size_t);
 136     void record16(int);
 137     void record16(T);
 138     void append16(const LChar*, size_t);
 139     void append16(const UChar* characters, size_t length) { m_buffer16.append(characters, length); }
 140
 141     ALWAYS_INLINE void shift();
 142     ALWAYS_INLINE bool atEnd() const;
 143     ALWAYS_INLINE T peek(int offset) const;
 144     struct UnicodeHexValue {
 145
 146         enum ValueType { ValidHex, IncompleteHex, InvalidHex };
 147
 148         explicit UnicodeHexValue(int value)
 149             : m_value(value)
 150         {
 151         }
 152         explicit UnicodeHexValue(ValueType type)
 153             : m_value(type == IncompleteHex ? -2 : -1)
 154         {
 155         }
 156
 157         ValueType valueType() const
 158         {
 159             if (m_value >= 0)
 160                 return ValidHex;
 161             return m_value == -2 ? IncompleteHex : InvalidHex;
 162         }
 163         bool isValid() const { return m_value >= 0; }
 164         int value() const
 165         {
 166             ASSERT(m_value >= 0);
 167             return m_value;
 168         }
 169
 170     private:
 171         int m_value;
 172     };
 173     UnicodeHexValue parseFourDigitUnicodeHex();
 174     void shiftLineTerminator();
 175
 176     ALWAYS_INLINE int offsetFromSourcePtr(const T* ptr) const { return ptr - m_codeStart; }
 177     ALWAYS_INLINE const T* sourcePtrFromOffset(int offset) const { return m_codeStart + offset; }
 178
 179     String invalidCharacterMessage() const;
 180     ALWAYS_INLINE const T* currentSourcePtr() const;
 181     ALWAYS_INLINE void setOffsetFromSourcePtr(const T* sourcePtr, unsigned lineStartOffset) { setOffset(offsetFromSourcePtr(sourcePtr), lineStartOffset); }
 182
 183     ALWAYS_INLINE void setCodeStart(const StringImpl*);
 184
 185     ALWAYS_INLINE const Identifier* makeIdentifier(const LChar* characters, size_t length);
 186     ALWAYS_INLINE const Identifier* makeIdentifier(const UChar* characters, size_t length);
 187     ALWAYS_INLINE const Identifier* makeLCharIdentifier(const LChar* characters, size_t length);
 188     ALWAYS_INLINE const Identifier* makeLCharIdentifier(const UChar* characters, size_t length);
 189     ALWAYS_INLINE const Identifier* makeRightSizedIdentifier(const UChar* characters, size_t length, UChar orAllChars);
 190     ALWAYS_INLINE const Identifier* makeIdentifierLCharFromUChar(const UChar* characters, size_t length);
 191
 192     ALWAYS_INLINE bool lastTokenWasRestrKeyword() const;
 193
 194     template <int shiftAmount> void internalShift();
 195     template <bool shouldCreateIdentifier> ALWAYS_INLINE JSTokenType parseKeyword(JSTokenData*);
 196     template <bool shouldBuildIdentifiers> ALWAYS_INLINE JSTokenType parseIdentifier(JSTokenData*, unsigned lexerFlags, bool strictMode);
 197     template <bool shouldBuildIdentifiers> NEVER_INLINE JSTokenType parseIdentifierSlowCase(JSTokenData*, unsigned lexerFlags, bool strictMode);
 198     enum StringParseResult {
 199         StringParsedSuccessfully,
 200         StringUnterminated,
 201         StringCannotBeParsed
 202     };
 203     template <bool shouldBuildStrings> ALWAYS_INLINE StringParseResult parseString(JSTokenData*, bool strictMode);
 204     template <bool shouldBuildStrings> NEVER_INLINE StringParseResult parseStringSlowCase(JSTokenData*, bool strictMode);
 205     ALWAYS_INLINE void parseHex(double& returnValue);
 206     ALWAYS_INLINE bool parseOctal(double& returnValue);
 207     ALWAYS_INLINE bool parseDecimal(double& returnValue);
 208     ALWAYS_INLINE void parseNumberAfterDecimalPoint();
 209     ALWAYS_INLINE bool parseNumberAfterExponentIndicator();
 210     ALWAYS_INLINE bool parseMultilineComment();
 211
 212     static const size_t initialReadBufferCapacity = 32;
 213
 214     int m_lineNumber;
 215     int m_lastLineNumber;
 216
 217     Vector<LChar> m_buffer8;
 218     Vector<UChar> m_buffer16;
 219     bool m_terminator;
 220     int m_lastToken;
 221
 222     const SourceCode* m_source;
 223     unsigned m_sourceOffset;
 224     const T* m_code;
 225     const T* m_codeStart;
 226     const T* m_codeEnd;
 227     const T* m_codeStartPlusOffset;
 228     const T* m_lineStart;
 229     JSTextPosition m_positionBeforeLastNewline;
 230     bool m_isReparsing;
 231     bool m_atLineStart;
 232     bool m_error;
 233     String m_lexErrorMessage;
 234
 235     T m_current;
 236
 237     IdentifierArena* m_arena;
 238
 239     VM* m_vm;
 240     bool m_parsingBuiltinFunction;
 241 };
 242
 243 template <>
 244 ALWAYS_INLINE bool Lexer<LChar>::isWhiteSpace(LChar ch)
 245 {
 246     return ch == ' ' || ch == '\t' || ch == 0xB || ch == 0xC || ch == 0xA0;
 247 }
 248
 249 template <>
 250 ALWAYS_INLINE bool Lexer<UChar>::isWhiteSpace(UChar ch)
 251 {
 252     // 0x180E used to be in Zs category before Unicode 6.3, and EcmaScript says that we should keep treating it as such.
 253     return (ch < 256) ? Lexer<LChar>::isWhiteSpace(static_cast<LChar>(ch)) : (u_charType(ch) == U_SPACE_SEPARATOR || ch == 0x180E || ch == 0xFEFF);
 254 }
 255
 256 template <>
 257 ALWAYS_INLINE bool Lexer<LChar>::isLineTerminator(LChar ch)
 258 {
 259     return ch == '\r' || ch == '\n';
 260 }
 261
 262 template <>
 263 ALWAYS_INLINE bool Lexer<UChar>::isLineTerminator(UChar ch)
 264 {
 265     return ch == '\r' || ch == '\n' || (ch & ~1) == 0x2028;
 266 }
 267
 268 template <typename T>
 269 inline unsigned char Lexer<T>::convertHex(int c1, int c2)
 270 {
 271     return (toASCIIHexValue(c1) << 4) | toASCIIHexValue(c2);
 272 }
 273
 274 template <typename T>
 275 inline UChar Lexer<T>::convertUnicode(int c1, int c2, int c3, int c4)
 276 {
 277     return (convertHex(c1, c2) << 8) | convertHex(c3, c4);
 278 }
 279
 280 template <typename T>
 281 ALWAYS_INLINE const Identifier* Lexer<T>::makeIdentifier(const LChar* characters, size_t length)
 282 {
 283     return &m_arena->makeIdentifier(m_vm, characters, length);
 284 }
 285
 286 template <typename T>
 287 ALWAYS_INLINE const Identifier* Lexer<T>::makeIdentifier(const UChar* characters, size_t length)
 288 {
 289     return &m_arena->makeIdentifier(m_vm, characters, length);
 290 }
 291
 292 template <>
 293 ALWAYS_INLINE const Identifier* Lexer<LChar>::makeRightSizedIdentifier(const UChar* characters, size_t length, UChar)
 294 {
 295     return &m_arena->makeIdentifierLCharFromUChar(m_vm, characters, length);
 296 }
 297
 298 template <>
 299 ALWAYS_INLINE const Identifier* Lexer<UChar>::makeRightSizedIdentifier(const UChar* characters, size_t length, UChar orAllChars)
 300 {
 301     if (!(orAllChars & ~0xff))
 302         return &m_arena->makeIdentifierLCharFromUChar(m_vm, characters, length);
 303
 304     return &m_arena->makeIdentifier(m_vm, characters, length);
 305 }
 306
 307 template <>
 308 ALWAYS_INLINE void Lexer<LChar>::setCodeStart(const StringImpl* sourceString)
 309 {
 310     ASSERT(sourceString->is8Bit());
 311     m_codeStart = sourceString->characters8();
 312 }
 313
 314 template <>
 315 ALWAYS_INLINE void Lexer<UChar>::setCodeStart(const StringImpl* sourceString)
 316 {
 317     ASSERT(!sourceString->is8Bit());
 318     m_codeStart = sourceString->characters16();
 319 }
 320
 321 template <typename T>
 322 ALWAYS_INLINE const Identifier* Lexer<T>::makeIdentifierLCharFromUChar(const UChar* characters, size_t length)
 323 {
 324     return &m_arena->makeIdentifierLCharFromUChar(m_vm, characters, length);
 325 }
 326
 327 template <typename T>
 328 ALWAYS_INLINE const Identifier* Lexer<T>::makeLCharIdentifier(const LChar* characters, size_t length)
 329 {
 330     return &m_arena->makeIdentifier(m_vm, characters, length);
 331 }
 332
 333 template <typename T>
 334 ALWAYS_INLINE const Identifier* Lexer<T>::makeLCharIdentifier(const UChar* characters, size_t length)
 335 {
 336     return &m_arena->makeIdentifierLCharFromUChar(m_vm, characters, length);
 337 }
 338
 339 #if ASSERT_DISABLED
 340 ALWAYS_INLINE bool isSafeBuiltinIdentifier(VM&, const Identifier*) { return true; }
 341 #else
 342 bool isSafeBuiltinIdentifier(VM&, const Identifier*);
 343 #endif
 344
 345 template <typename T>
 346 ALWAYS_INLINE JSTokenType Lexer<T>::lexExpectIdentifier(JSToken* tokenRecord, unsigned lexerFlags, bool strictMode)
 347 {
 348     JSTokenData* tokenData = &tokenRecord->m_data;
 349     JSTokenLocation* tokenLocation = &tokenRecord->m_location;
 350     ASSERT((lexerFlags & LexerFlagsIgnoreReservedWords));
 351     const T* start = m_code;
 352     const T* ptr = start;
 353     const T* end = m_codeEnd;
 354     JSTextPosition startPosition = currentPosition();
 355     if (ptr >= end) {
 356         ASSERT(ptr == end);
 357         goto slowCase;
 358     }
 359     if (!WTF::isASCIIAlpha(*ptr))
 360         goto slowCase;
 361     ++ptr;
 362     while (ptr < end) {
 363         if (!WTF::isASCIIAlphanumeric(*ptr))
 364             break;
 365         ++ptr;
 366     }
 367
 368     // Here's the shift
 369     if (ptr < end) {
 370         if ((!WTF::isASCII(*ptr)) || (*ptr == '\\') || (*ptr == '_') || (*ptr == '$'))
 371             goto slowCase;
 372         m_current = *ptr;
 373     } else
 374         m_current = 0;
 375
 376     m_code = ptr;
 377     ASSERT(currentOffset() >= currentLineStartOffset());
 378
 379     // Create the identifier if needed
 380     if (lexerFlags & LexexFlagsDontBuildKeywords
 381 #if !ASSERT_DISABLED
 382         && !m_parsingBuiltinFunction
 383 #endif
 384         )
 385         tokenData->ident = 0;
 386     else
 387         tokenData->ident = makeLCharIdentifier(start, ptr - start);
 388
 389     tokenLocation->line = m_lineNumber;
 390     tokenLocation->lineStartOffset = currentLineStartOffset();
 391     tokenLocation->startOffset = offsetFromSourcePtr(start);
 392     tokenLocation->endOffset = currentOffset();
 393     ASSERT(tokenLocation->startOffset >= tokenLocation->lineStartOffset);
 394     tokenRecord->m_startPosition = startPosition;
 395     tokenRecord->m_endPosition = currentPosition();
 396 #if !ASSERT_DISABLED
 397     if (m_parsingBuiltinFunction) {
 398         if (!isSafeBuiltinIdentifier(*m_vm, tokenData->ident))
 399             return ERRORTOK;
 400     }
 401 #endif
 402
 403     m_lastToken = IDENT;
 404     return IDENT;
 405
 406 slowCase:
 407     return lex(tokenRecord, lexerFlags, strictMode);
 408 }
 409
 410 } // namespace JSC
 411
 412 #endif // Lexer_h