parser/Lexer.cpp

   1 /*
   2  *  Copyright (C) 1999-2000 Harri Porten (porten@kde.org)
   3  *  Copyright (C) 2006, 2007, 2008, 2009 Apple Inc. All Rights Reserved.
   4  *  Copyright (C) 2007 Cameron Zwarich (cwzwarich@uwaterloo.ca)
   5  *  Copyright (C) 2010 Zoltan Herczeg (zherczeg@inf.u-szeged.hu)
   6  *
   7  *  This library is free software; you can redistribute it and/or
   8  *  modify it under the terms of the GNU Library General Public
   9  *  License as published by the Free Software Foundation; either
  10  *  version 2 of the License, or (at your option) any later version.
  11  *
  12  *  This library is distributed in the hope that it will be useful,
  13  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
  14  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  15  *  Library General Public License for more details.
  16  *
  17  *  You should have received a copy of the GNU Library General Public License
  18  *  along with this library; see the file COPYING.LIB.  If not, write to
  19  *  the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
  20  *  Boston, MA 02110-1301, USA.
  21  *
  22  */
  23
  24 #include "config.h"
  25 #include "Lexer.h"
  26
  27 #include "JSFunction.h"
  28
  29 #include "JSGlobalObjectFunctions.h"
  30 #include "Identifier.h"
  31 #include "NodeInfo.h"
  32 #include "Nodes.h"
  33 #include "dtoa.h"
  34 #include <ctype.h>
  35 #include <limits.h>
  36 #include <string.h>
  37 #include <wtf/Assertions.h>
  38
  39 using namespace WTF;
  40 using namespace Unicode;
  41
  42 #include "JSParser.h"
  43 #include "KeywordLookup.h"
  44 #include "Lookup.h"
  45 #include "Lexer.lut.h"
  46
  47 namespace JSC {
  48
  49
// Classification assigned to each source character before the lexer's main
// switch dispatches on it. ASCII characters get their type from the
// typesOfASCIICharacters table; the enumerator ORDER is significant (see below).
enum CharacterType {
    // Types for the main switch

    // The first three types are fixed, and also used for identifying
    // ASCII alpha and alphanumeric characters (see isIdentStart and isIdentPart).
    // isIdentPart relies on "type <= CharacterNumber", so nothing may be
    // inserted before CharacterNumber and these three must stay in this order.
    CharacterIdentifierStart,
    CharacterZero,
    CharacterNumber,

    // Punctuators and other single-character classes.
    CharacterInvalid,
    CharacterLineTerminator,
    CharacterExclamationMark,
    CharacterOpenParen,
    CharacterCloseParen,
    CharacterOpenBracket,
    CharacterCloseBracket,
    CharacterComma,
    CharacterColon,
    CharacterQuestion,
    CharacterTilde,
    CharacterQuote,
    CharacterDot,
    CharacterSlash,
    CharacterBackSlash,
    CharacterSemicolon,
    CharacterOpenBrace,
    CharacterCloseBrace,

    // Characters that begin an operator which may be followed by more
    // operator characters (e.g. '+', '++', '+=').
    CharacterAdd,
    CharacterSub,
    CharacterMultiply,
    CharacterModulo,
    CharacterAnd,
    CharacterXor,
    CharacterOr,
    CharacterLess,
    CharacterGreater,
    CharacterEqual,

    // Other types (only one so far)
    CharacterWhiteSpace,
};
  92
  93 // 128 ASCII codes
// Lookup table mapping each 7-bit ASCII code (0-127) to its CharacterType.
// It is indexed directly by the character value, so callers must first make
// sure the value is ASCII (see isIdentStart, isIdentPart and Lexer::lex).
// Note that '$' and '_' are identifier starts, and that both '"' and '\''
// map to CharacterQuote.
static const unsigned short typesOfASCIICharacters[128] = {
/*   0 - Null               */ CharacterInvalid,
/*   1 - Start of Heading   */ CharacterInvalid,
/*   2 - Start of Text      */ CharacterInvalid,
/*   3 - End of Text        */ CharacterInvalid,
/*   4 - End of Transm.     */ CharacterInvalid,
/*   5 - Enquiry            */ CharacterInvalid,
/*   6 - Acknowledgment     */ CharacterInvalid,
/*   7 - Bell               */ CharacterInvalid,
/*   8 - Back Space         */ CharacterInvalid,
/*   9 - Horizontal Tab     */ CharacterWhiteSpace,
/*  10 - Line Feed          */ CharacterLineTerminator,
/*  11 - Vertical Tab       */ CharacterWhiteSpace,
/*  12 - Form Feed          */ CharacterWhiteSpace,
/*  13 - Carriage Return    */ CharacterLineTerminator,
/*  14 - Shift Out          */ CharacterInvalid,
/*  15 - Shift In           */ CharacterInvalid,
/*  16 - Data Link Escape   */ CharacterInvalid,
/*  17 - Device Control 1   */ CharacterInvalid,
/*  18 - Device Control 2   */ CharacterInvalid,
/*  19 - Device Control 3   */ CharacterInvalid,
/*  20 - Device Control 4   */ CharacterInvalid,
/*  21 - Negative Ack.      */ CharacterInvalid,
/*  22 - Synchronous Idle   */ CharacterInvalid,
/*  23 - End of Tr. Block   */ CharacterInvalid,
/*  24 - Cancel             */ CharacterInvalid,
/*  25 - End of Medium      */ CharacterInvalid,
/*  26 - Substitute         */ CharacterInvalid,
/*  27 - Escape             */ CharacterInvalid,
/*  28 - File Separator     */ CharacterInvalid,
/*  29 - Group Separator    */ CharacterInvalid,
/*  30 - Record Separator   */ CharacterInvalid,
/*  31 - Unit Separator     */ CharacterInvalid,
/*  32 - Space              */ CharacterWhiteSpace,
/*  33 - !                  */ CharacterExclamationMark,
/*  34 - "                  */ CharacterQuote,
/*  35 - #                  */ CharacterInvalid,
/*  36 - $                  */ CharacterIdentifierStart,
/*  37 - %                  */ CharacterModulo,
/*  38 - &                  */ CharacterAnd,
/*  39 - '                  */ CharacterQuote,
/*  40 - (                  */ CharacterOpenParen,
/*  41 - )                  */ CharacterCloseParen,
/*  42 - *                  */ CharacterMultiply,
/*  43 - +                  */ CharacterAdd,
/*  44 - ,                  */ CharacterComma,
/*  45 - -                  */ CharacterSub,
/*  46 - .                  */ CharacterDot,
/*  47 - /                  */ CharacterSlash,
/*  48 - 0                  */ CharacterZero,
/*  49 - 1                  */ CharacterNumber,
/*  50 - 2                  */ CharacterNumber,
/*  51 - 3                  */ CharacterNumber,
/*  52 - 4                  */ CharacterNumber,
/*  53 - 5                  */ CharacterNumber,
/*  54 - 6                  */ CharacterNumber,
/*  55 - 7                  */ CharacterNumber,
/*  56 - 8                  */ CharacterNumber,
/*  57 - 9                  */ CharacterNumber,
/*  58 - :                  */ CharacterColon,
/*  59 - ;                  */ CharacterSemicolon,
/*  60 - <                  */ CharacterLess,
/*  61 - =                  */ CharacterEqual,
/*  62 - >                  */ CharacterGreater,
/*  63 - ?                  */ CharacterQuestion,
/*  64 - @                  */ CharacterInvalid,
/*  65 - A                  */ CharacterIdentifierStart,
/*  66 - B                  */ CharacterIdentifierStart,
/*  67 - C                  */ CharacterIdentifierStart,
/*  68 - D                  */ CharacterIdentifierStart,
/*  69 - E                  */ CharacterIdentifierStart,
/*  70 - F                  */ CharacterIdentifierStart,
/*  71 - G                  */ CharacterIdentifierStart,
/*  72 - H                  */ CharacterIdentifierStart,
/*  73 - I                  */ CharacterIdentifierStart,
/*  74 - J                  */ CharacterIdentifierStart,
/*  75 - K                  */ CharacterIdentifierStart,
/*  76 - L                  */ CharacterIdentifierStart,
/*  77 - M                  */ CharacterIdentifierStart,
/*  78 - N                  */ CharacterIdentifierStart,
/*  79 - O                  */ CharacterIdentifierStart,
/*  80 - P                  */ CharacterIdentifierStart,
/*  81 - Q                  */ CharacterIdentifierStart,
/*  82 - R                  */ CharacterIdentifierStart,
/*  83 - S                  */ CharacterIdentifierStart,
/*  84 - T                  */ CharacterIdentifierStart,
/*  85 - U                  */ CharacterIdentifierStart,
/*  86 - V                  */ CharacterIdentifierStart,
/*  87 - W                  */ CharacterIdentifierStart,
/*  88 - X                  */ CharacterIdentifierStart,
/*  89 - Y                  */ CharacterIdentifierStart,
/*  90 - Z                  */ CharacterIdentifierStart,
/*  91 - [                  */ CharacterOpenBracket,
/*  92 - \                  */ CharacterBackSlash,
/*  93 - ]                  */ CharacterCloseBracket,
/*  94 - ^                  */ CharacterXor,
/*  95 - _                  */ CharacterIdentifierStart,
/*  96 - `                  */ CharacterInvalid,
/*  97 - a                  */ CharacterIdentifierStart,
/*  98 - b                  */ CharacterIdentifierStart,
/*  99 - c                  */ CharacterIdentifierStart,
/* 100 - d                  */ CharacterIdentifierStart,
/* 101 - e                  */ CharacterIdentifierStart,
/* 102 - f                  */ CharacterIdentifierStart,
/* 103 - g                  */ CharacterIdentifierStart,
/* 104 - h                  */ CharacterIdentifierStart,
/* 105 - i                  */ CharacterIdentifierStart,
/* 106 - j                  */ CharacterIdentifierStart,
/* 107 - k                  */ CharacterIdentifierStart,
/* 108 - l                  */ CharacterIdentifierStart,
/* 109 - m                  */ CharacterIdentifierStart,
/* 110 - n                  */ CharacterIdentifierStart,
/* 111 - o                  */ CharacterIdentifierStart,
/* 112 - p                  */ CharacterIdentifierStart,
/* 113 - q                  */ CharacterIdentifierStart,
/* 114 - r                  */ CharacterIdentifierStart,
/* 115 - s                  */ CharacterIdentifierStart,
/* 116 - t                  */ CharacterIdentifierStart,
/* 117 - u                  */ CharacterIdentifierStart,
/* 118 - v                  */ CharacterIdentifierStart,
/* 119 - w                  */ CharacterIdentifierStart,
/* 120 - x                  */ CharacterIdentifierStart,
/* 121 - y                  */ CharacterIdentifierStart,
/* 122 - z                  */ CharacterIdentifierStart,
/* 123 - {                  */ CharacterOpenBrace,
/* 124 - |                  */ CharacterOr,
/* 125 - }                  */ CharacterCloseBrace,
/* 126 - ~                  */ CharacterTilde,
/* 127 - Delete             */ CharacterInvalid,
};
 224
// Constructs a lexer bound to |globalData|. The keyword table is initialized
// from JSC::mainTable; no source is attached until setCode() is called.
Lexer::Lexer(JSGlobalData* globalData)
    : m_isReparsing(false)
    , m_globalData(globalData)
    , m_keywordTable(JSC::mainTable)
{
}
 231
// Releases the keyword lookup table held by this lexer.
Lexer::~Lexer()
{
    m_keywordTable.deleteTable();
}
 236
// Returns the pointer to the character currently being examined.
// The pointer may equal m_codeEnd (end of input) but must never pass it.
ALWAYS_INLINE const UChar* Lexer::currentCharacter() const
{
    ASSERT(m_code <= m_codeEnd);
    return m_code;
}
 242
 243 ALWAYS_INLINE int Lexer::currentOffset() const
 244 {
 245     return currentCharacter() - m_codeStart;
 246 }
 247
 248 void Lexer::setCode(const SourceCode& source, ParserArena& arena)
 249 {
 250     m_arena = &arena.identifierArena();
 251
 252     m_lineNumber = source.firstLine();
 253     m_delimited = false;
 254     m_lastToken = -1;
 255
 256     const UChar* data = source.provider()->data();
 257
 258     m_source = &source;
 259     m_codeStart = data;
 260     m_code = data + source.startOffset();
 261     m_codeEnd = data + source.endOffset();
 262     m_error = false;
 263     m_atLineStart = true;
 264
 265     m_buffer8.reserveInitialCapacity(initialReadBufferCapacity);
 266     m_buffer16.reserveInitialCapacity((m_codeEnd - m_code) / 2);
 267
 268     if (LIKELY(m_code < m_codeEnd))
 269         m_current = *m_code;
 270     else
 271         m_current = -1;
 272     ASSERT(currentOffset() == source.startOffset());
 273 }
 274
// Advances the read position by |shiftAmount| characters and reloads m_current.
//
// With DoBoundsCheck, m_current becomes -1 when the new position is at or past
// m_codeEnd. Without it, the character is read unconditionally, so the caller
// must already know that the new position is inside the buffer.
template <int shiftAmount, Lexer::ShiftType shouldBoundsCheck> ALWAYS_INLINE void Lexer::internalShift()
{
    if (shouldBoundsCheck == DoBoundsCheck) {
        // Faster than an if-else sequence
        // (assume end of input first, then overwrite if a character is available).
        ASSERT(m_current != -1);
        m_current = -1;
        m_code += shiftAmount;
        if (LIKELY(m_code < m_codeEnd))
            m_current = *m_code;
    } else {
        m_code += shiftAmount;
        m_current = *m_code;
    }
}
 289
// Advances by exactly one character with end-of-input checking;
// m_current becomes -1 once the end of the source is reached.
ALWAYS_INLINE void Lexer::shift()
{
    internalShift<1, DoBoundsCheck>();
}
 294
 295 ALWAYS_INLINE int Lexer::peek(int offset)
 296 {
 297     // Only use if necessary
 298     ASSERT(offset > 0 && offset < 5);
 299     const UChar* code = m_code + offset;
 300     return (code < m_codeEnd) ? *code : -1;
 301 }
 302
 303 int Lexer::getUnicodeCharacter()
 304 {
 305     int char1 = peek(1);
 306     int char2 = peek(2);
 307     int char3 = peek(3);
 308
 309     if (UNLIKELY(!isASCIIHexDigit(m_current) || !isASCIIHexDigit(char1) || !isASCIIHexDigit(char2) || !isASCIIHexDigit(char3)))
 310         return -1;
 311
 312     int result = convertUnicode(m_current, char1, char2, char3);
 313     shift();
 314     shift();
 315     shift();
 316     shift();
 317     return result;
 318 }
 319
 320 void Lexer::shiftLineTerminator()
 321 {
 322     ASSERT(isLineTerminator(m_current));
 323
 324     int m_prev = m_current;
 325     shift();
 326
 327     // Allow both CRLF and LFCR.
 328     if (m_prev + m_current == '\n' + '\r')
 329         shift();
 330
 331     ++m_lineNumber;
 332 }
 333
 334 ALWAYS_INLINE bool Lexer::lastTokenWasRestrKeyword() const
 335 {
 336     return m_lastToken == CONTINUE || m_lastToken == BREAK || m_lastToken == RETURN || m_lastToken == THROW;
 337 }
 338
 339 static NEVER_INLINE bool isNonASCIIIdentStart(int c)
 340 {
 341     return category(c) & (Letter_Uppercase | Letter_Lowercase | Letter_Titlecase | Letter_Modifier | Letter_Other);
 342 }
 343
 344 static inline bool isIdentStart(int c)
 345 {
 346     return isASCII(c) ? typesOfASCIICharacters[c] == CharacterIdentifierStart : isNonASCIIIdentStart(c);
 347 }
 348
 349 static NEVER_INLINE bool isNonASCIIIdentPart(int c)
 350 {
 351     return category(c) & (Letter_Uppercase | Letter_Lowercase | Letter_Titlecase | Letter_Modifier | Letter_Other
 352         | Mark_NonSpacing | Mark_SpacingCombining | Number_DecimalDigit | Punctuation_Connector);
 353 }
 354
 355 static ALWAYS_INLINE bool isIdentPart(int c)
 356 {
 357     // Character types are divided into two groups depending on whether they can be part of an
 358     // identifier or not. Those whose type value is less or equal than CharacterNumber can be
 359     // part of an identifier. (See the CharacterType definition for more details.)
 360     return isASCII(c) ? typesOfASCIICharacters[c] <= CharacterNumber : isNonASCIIIdentPart(c);
 361 }
 362
 363 static inline int singleEscape(int c)
 364 {
 365     switch (c) {
 366     case 'b':
 367         return 0x08;
 368     case 't':
 369         return 0x09;
 370     case 'n':
 371         return 0x0A;
 372     case 'v':
 373         return 0x0B;
 374     case 'f':
 375         return 0x0C;
 376     case 'r':
 377         return 0x0D;
 378     case '\\':
 379         return '\\';
 380     case '\'':
 381         return '\'';
 382     case '"':
 383         return '"';
 384     default:
 385         return 0;
 386     }
 387 }
 388
 389 inline void Lexer::record8(int c)
 390 {
 391     ASSERT(c >= 0);
 392     ASSERT(c <= 0xFF);
 393     m_buffer8.append(static_cast<char>(c));
 394 }
 395
// Appends one UTF-16 code unit to the 16-bit scratch buffer.
inline void Lexer::record16(UChar c)
{
    m_buffer16.append(c);
}
 400
 401 inline void Lexer::record16(int c)
 402 {
 403     ASSERT(c >= 0);
 404     ASSERT(c <= USHRT_MAX);
 405     record16(UChar(static_cast<unsigned short>(c)));
 406 }
 407
 408 template <bool shouldCreateIdentifier> ALWAYS_INLINE JSTokenType Lexer::parseIdentifier(JSTokenData* tokenData, unsigned lexType)
 409 {
 410     const ptrdiff_t remaining = m_codeEnd - m_code;
 411     if ((remaining >= maxTokenLength) && !(lexType & IgnoreReservedWords)) {
 412         JSTokenType keyword = parseKeyword<shouldCreateIdentifier>(tokenData);
 413         if (keyword != IDENT) {
 414             ASSERT((!shouldCreateIdentifier) || tokenData->ident);
 415             return keyword;
 416         }
 417     }
 418     const UChar* identifierStart = currentCharacter();
 419     bool bufferRequired = false;
 420
 421     while (true) {
 422         if (LIKELY(isIdentPart(m_current))) {
 423             shift();
 424             continue;
 425         }
 426         if (LIKELY(m_current != '\\'))
 427             break;
 428
 429         // \uXXXX unicode characters.
 430         bufferRequired = true;
 431         if (identifierStart != currentCharacter())
 432             m_buffer16.append(identifierStart, currentCharacter() - identifierStart);
 433         shift();
 434         if (UNLIKELY(m_current != 'u'))
 435             return ERRORTOK;
 436         shift();
 437         int character = getUnicodeCharacter();
 438         if (UNLIKELY(character == -1))
 439             return ERRORTOK;
 440         if (UNLIKELY(m_buffer16.size() ? !isIdentPart(character) : !isIdentStart(character)))
 441             return ERRORTOK;
 442         if  (shouldCreateIdentifier)
 443             record16(character);
 444         identifierStart = currentCharacter();
 445     }
 446
 447     int identifierLength;
 448     const Identifier* ident = 0;
 449     if (shouldCreateIdentifier) {
 450         if (!bufferRequired)
 451             identifierLength = currentCharacter() - identifierStart;
 452         else {
 453             if (identifierStart != currentCharacter())
 454                 m_buffer16.append(identifierStart, currentCharacter() - identifierStart);
 455             identifierStart = m_buffer16.data();
 456             identifierLength = m_buffer16.size();
 457         }
 458
 459         ident = makeIdentifier(identifierStart, identifierLength);
 460         tokenData->ident = ident;
 461     } else
 462         tokenData->ident = 0;
 463
 464     m_delimited = false;
 465
 466     if (LIKELY(!bufferRequired && !(lexType & IgnoreReservedWords))) {
 467         ASSERT(shouldCreateIdentifier);
 468         // Keywords must not be recognized if there was an \uXXXX in the identifier.
 469         if (remaining < maxTokenLength) {
 470             const HashEntry* entry = m_keywordTable.entry(m_globalData, *ident);
 471             ASSERT((remaining < maxTokenLength) || !entry);
 472             return entry ? static_cast<JSTokenType>(entry->lexerValue()) : IDENT;
 473         }
 474         return IDENT;
 475     }
 476
 477     m_buffer16.resize(0);
 478     return IDENT;
 479 }
 480
 481 bool Lexer::isKeyword(const Identifier& ident)
 482 {
 483     return m_keywordTable.entry(m_globalData, ident);
 484 }
 485
// Scans a quoted string literal. On entry m_current is the opening quote
// character (the same character terminates the string).
//
// tokenData  - on success receives the string contents as an Identifier in
//              tokenData->ident, or 0 when shouldBuildStrings is false.
// strictMode - when true, the only numeric escape accepted is \0 not followed
//              by a decimal digit; otherwise octal escapes are decoded.
//
// Returns false for an unterminated string (line terminator or end of input
// inside the literal) or an invalid escape; returns true otherwise, leaving
// m_current on the closing quote and m_buffer16 emptied.
template <bool shouldBuildStrings> ALWAYS_INLINE bool Lexer::parseString(JSTokenData* tokenData, bool strictMode)
{
    int stringQuoteCharacter = m_current;
    shift();

    // Start of the current run of plain characters not yet copied to m_buffer16.
    const UChar* stringStart = currentCharacter();

    while (m_current != stringQuoteCharacter) {
        if (UNLIKELY(m_current == '\\')) {
            // Flush the plain characters seen so far before handling the escape.
            if (stringStart != currentCharacter() && shouldBuildStrings)
                m_buffer16.append(stringStart, currentCharacter() - stringStart);
            shift();

            int escape = singleEscape(m_current);

            // Most common escape sequences first
            if (escape) {
                 if (shouldBuildStrings)
                     record16(escape);
                shift();
            } else if (UNLIKELY(isLineTerminator(m_current)))
                // Backslash followed by a line break: nothing is recorded.
                shiftLineTerminator();
            else if (m_current == 'x') {
                shift();
                // \xHH needs two hex digits; otherwise a literal 'x' is recorded
                // and the following characters are scanned normally.
                if (isASCIIHexDigit(m_current) && isASCIIHexDigit(peek(1))) {
                    int prev = m_current;
                    shift();
                    if (shouldBuildStrings)
                        record16(convertHex(prev, m_current));
                    shift();
                } else if (shouldBuildStrings)
                    record16('x');
            } else if (m_current == 'u') {
                shift();
                int character = getUnicodeCharacter();
                if (character != -1) {
                    if (shouldBuildStrings)
                        record16(character);
                } else if (m_current == stringQuoteCharacter) {
                    // "\u" immediately before the closing quote is kept as 'u'.
                    if (shouldBuildStrings)
                        record16('u');
                } else // Only stringQuoteCharacter allowed after \u
                    return false;
            } else if (strictMode && isASCIIDigit(m_current)) {
                // The only valid numeric escape in strict mode is '\0', and this must not be followed by a decimal digit.
                int character1 = m_current;
                shift();
                if (character1 != '0' || isASCIIDigit(m_current))
                    return false;
                if (shouldBuildStrings)
                    record16(0);
            } else if (!strictMode && isASCIIOctalDigit(m_current)) {
                // Octal character sequences
                int character1 = m_current;
                shift();
                if (isASCIIOctalDigit(m_current)) {
                    // Two octal characters
                    int character2 = m_current;
                    shift();
                    // A third digit is only consumed when the first is 0-3,
                    // which keeps the value at or below 0377.
                    if (character1 >= '0' && character1 <= '3' && isASCIIOctalDigit(m_current)) {
                        if (shouldBuildStrings)
                            record16((character1 - '0') * 64 + (character2 - '0') * 8 + m_current - '0');
                        shift();
                    } else {
                        if (shouldBuildStrings)
                            record16((character1 - '0') * 8 + character2 - '0');
                    }
                } else {
                    if (shouldBuildStrings)
                        record16(character1 - '0');
                }
            } else if (m_current != -1) {
                // Any other escaped character stands for itself.
                if (shouldBuildStrings)
                    record16(m_current);
                shift();
            } else
                // End of input right after the backslash.
                return false;

            stringStart = currentCharacter();
            continue;
        }
        // Fast check for characters that require special handling.
        // Catches -1, \n, \r, 0x2028, and 0x2029 as efficiently
        // as possible, and lets through all common ASCII characters.
        if (UNLIKELY(((static_cast<unsigned>(m_current) - 0xE) & 0x2000))) {
            // New-line or end of input is not allowed
            if (UNLIKELY(isLineTerminator(m_current)) || UNLIKELY(m_current == -1))
                return false;
            // Anything else is just a normal character
        }
        shift();
    }

    // Flush the final run of plain characters and build the result.
    if (currentCharacter() != stringStart && shouldBuildStrings)
        m_buffer16.append(stringStart, currentCharacter() - stringStart);
    if (shouldBuildStrings)
        tokenData->ident = makeIdentifier(m_buffer16.data(), m_buffer16.size());
    else
        tokenData->ident = 0;

    m_buffer16.resize(0);
    return true;
}
 589
 590 ALWAYS_INLINE void Lexer::parseHex(double& returnValue)
 591 {
 592     // Optimization: most hexadecimal values fit into 4 bytes.
 593     uint32_t hexValue = 0;
 594     int maximumDigits = 7;
 595
 596     // Shift out the 'x' prefix.
 597     shift();
 598
 599     do {
 600         hexValue = (hexValue << 4) + toASCIIHexValue(m_current);
 601         shift();
 602         --maximumDigits;
 603     } while (isASCIIHexDigit(m_current) && maximumDigits >= 0);
 604
 605     if (maximumDigits >= 0) {
 606         returnValue = hexValue;
 607         return;
 608     }
 609
 610     // No more place in the hexValue buffer.
 611     // The values are shifted out and placed into the m_buffer8 vector.
 612     for (int i = 0; i < 8; ++i) {
 613          int digit = hexValue >> 28;
 614          if (digit < 10)
 615              record8(digit + '0');
 616          else
 617              record8(digit - 10 + 'a');
 618          hexValue <<= 4;
 619     }
 620
 621     while (isASCIIHexDigit(m_current)) {
 622         record8(m_current);
 623         shift();
 624     }
 625
 626     returnValue = parseIntOverflow(m_buffer8.data(), m_buffer8.size(), 16);
 627 }
 628
 629 ALWAYS_INLINE bool Lexer::parseOctal(double& returnValue)
 630 {
 631     // Optimization: most octal values fit into 4 bytes.
 632     uint32_t octalValue = 0;
 633     int maximumDigits = 9;
 634     // Temporary buffer for the digits. Makes easier
 635     // to reconstruct the input characters when needed.
 636     char digits[10];
 637
 638     do {
 639         octalValue = octalValue * 8 + (m_current - '0');
 640         digits[maximumDigits] = m_current;
 641         shift();
 642         --maximumDigits;
 643     } while (isASCIIOctalDigit(m_current) && maximumDigits >= 0);
 644
 645     if (!isASCIIDigit(m_current) && maximumDigits >= 0) {
 646         returnValue = octalValue;
 647         return true;
 648     }
 649
 650     for (int i = 9; i > maximumDigits; --i)
 651          record8(digits[i]);
 652
 653     while (isASCIIOctalDigit(m_current)) {
 654         record8(m_current);
 655         shift();
 656     }
 657
 658     if (isASCIIDigit(m_current))
 659         return false;
 660
 661     returnValue = parseIntOverflow(m_buffer8.data(), m_buffer8.size(), 8);
 662     return true;
 663 }
 664
 665 ALWAYS_INLINE bool Lexer::parseDecimal(double& returnValue)
 666 {
 667     // Optimization: most decimal values fit into 4 bytes.
 668     uint32_t decimalValue = 0;
 669
 670     // Since parseOctal may be executed before parseDecimal,
 671     // the m_buffer8 may hold ascii digits.
 672     if (!m_buffer8.size()) {
 673         int maximumDigits = 9;
 674         // Temporary buffer for the digits. Makes easier
 675         // to reconstruct the input characters when needed.
 676         char digits[10];
 677
 678         do {
 679             decimalValue = decimalValue * 10 + (m_current - '0');
 680             digits[maximumDigits] = m_current;
 681             shift();
 682             --maximumDigits;
 683         } while (isASCIIDigit(m_current) && maximumDigits >= 0);
 684
 685         if (maximumDigits >= 0 && m_current != '.' && (m_current | 0x20) != 'e') {
 686             returnValue = decimalValue;
 687             return true;
 688         }
 689
 690         for (int i = 9; i > maximumDigits; --i)
 691             record8(digits[i]);
 692     }
 693
 694     while (isASCIIDigit(m_current)) {
 695         record8(m_current);
 696         shift();
 697     }
 698
 699     return false;
 700 }
 701
 702 ALWAYS_INLINE void Lexer::parseNumberAfterDecimalPoint()
 703 {
 704     record8('.');
 705     while (isASCIIDigit(m_current)) {
 706         record8(m_current);
 707         shift();
 708     }
 709 }
 710
 711 ALWAYS_INLINE bool Lexer::parseNumberAfterExponentIndicator()
 712 {
 713     record8('e');
 714     shift();
 715     if (m_current == '+' || m_current == '-') {
 716         record8(m_current);
 717         shift();
 718     }
 719
 720     if (!isASCIIDigit(m_current))
 721         return false;
 722
 723     do {
 724         record8(m_current);
 725         shift();
 726     } while (isASCIIDigit(m_current));
 727     return true;
 728 }
 729
 730 ALWAYS_INLINE bool Lexer::parseMultilineComment()
 731 {
 732     while (true) {
 733         while (UNLIKELY(m_current == '*')) {
 734             shift();
 735             if (m_current == '/') {
 736                 shift();
 737                 return true;
 738             }
 739         }
 740
 741         if (UNLIKELY(m_current == -1))
 742             return false;
 743
 744         if (isLineTerminator(m_current))
 745             shiftLineTerminator();
 746         else
 747             shift();
 748     }
 749 }
 750
 751 bool Lexer::nextTokenIsColon()
 752 {
 753     const UChar* code = m_code;
 754     while (code < m_codeEnd && (isWhiteSpace(*code) || isLineTerminator(*code)))
 755         code++;
 756
 757     return code < m_codeEnd && *code == ':';
 758 }
 759
 760 JSTokenType Lexer::lex(JSTokenData* tokenData, JSTokenInfo* tokenInfo, unsigned lexType, bool strictMode)
 761 {
 762     ASSERT(!m_error);
 763     ASSERT(m_buffer8.isEmpty());
 764     ASSERT(m_buffer16.isEmpty());
 765
 766     JSTokenType token = ERRORTOK;
 767     m_terminator = false;
 768
 769 start:
 770     while (isWhiteSpace(m_current))
 771         shift();
 772
 773     int startOffset = currentOffset();
 774
 775     if (UNLIKELY(m_current == -1))
 776         return EOFTOK;
 777
 778     m_delimited = false;
 779
 780     CharacterType type;
 781     if (LIKELY(isASCII(m_current)))
 782         type = static_cast<CharacterType>(typesOfASCIICharacters[m_current]);
 783     else if (isNonASCIIIdentStart(m_current))
 784         type = CharacterIdentifierStart;
 785     else if (isLineTerminator(m_current))
 786         type = CharacterLineTerminator;
 787     else
 788         type = CharacterInvalid;
 789
 790     switch (type) {
 791     case CharacterGreater:
 792         shift();
 793         if (m_current == '>') {
 794             shift();
 795             if (m_current == '>') {
 796                 shift();
 797                 if (m_current == '=') {
 798                     shift();
 799                     token = URSHIFTEQUAL;
 800                     break;
 801                 }
 802                 token = URSHIFT;
 803                 break;
 804             }
 805             if (m_current == '=') {
 806                 shift();
 807                 token = RSHIFTEQUAL;
 808                 break;
 809             }
 810             token = RSHIFT;
 811             break;
 812         }
 813         if (m_current == '=') {
 814             shift();
 815             token = GE;
 816             break;
 817         }
 818         token = GT;
 819         break;
 820     case CharacterEqual:
 821         shift();
 822         if (m_current == '=') {
 823             shift();
 824             if (m_current == '=') {
 825                 shift();
 826                 token = STREQ;
 827                 break;
 828             }
 829             token = EQEQ;
 830             break;
 831         }
 832         token = EQUAL;
 833         break;
 834     case CharacterLess:
 835         shift();
 836         if (m_current == '!' && peek(1) == '-' && peek(2) == '-') {
 837             // <!-- marks the beginning of a line comment (for www usage)
 838             goto inSingleLineComment;
 839         }
 840         if (m_current == '<') {
 841             shift();
 842             if (m_current == '=') {
 843                 shift();
 844                 token = LSHIFTEQUAL;
 845                 break;
 846             }
 847             token = LSHIFT;
 848             break;
 849         }
 850         if (m_current == '=') {
 851             shift();
 852             token = LE;
 853             break;
 854         }
 855         token = LT;
 856         break;
 857     case CharacterExclamationMark:
 858         shift();
 859         if (m_current == '=') {
 860             shift();
 861             if (m_current == '=') {
 862                 shift();
 863                 token = STRNEQ;
 864                 break;
 865             }
 866             token = NE;
 867             break;
 868         }
 869         token = EXCLAMATION;
 870         break;
 871     case CharacterAdd:
 872         shift();
 873         if (m_current == '+') {
 874             shift();
 875             token = (!m_terminator) ? PLUSPLUS : AUTOPLUSPLUS;
 876             break;
 877         }
 878         if (m_current == '=') {
 879             shift();
 880             token = PLUSEQUAL;
 881             break;
 882         }
 883         token = PLUS;
 884         break;
 885     case CharacterSub:
 886         shift();
 887         if (m_current == '-') {
 888             shift();
 889             if (m_atLineStart && m_current == '>') {
 890                 shift();
 891                 goto inSingleLineComment;
 892             }
 893             token = (!m_terminator) ? MINUSMINUS : AUTOMINUSMINUS;
 894             break;
 895         }
 896         if (m_current == '=') {
 897             shift();
 898             token = MINUSEQUAL;
 899             break;
 900         }
 901         token = MINUS;
 902         break;
 903     case CharacterMultiply:
 904         shift();
 905         if (m_current == '=') {
 906             shift();
 907             token = MULTEQUAL;
 908             break;
 909         }
 910         token = TIMES;
 911         break;
 912     case CharacterSlash:
 913         shift();
 914         if (m_current == '/') {
 915             shift();
 916             goto inSingleLineComment;
 917         }
 918         if (m_current == '*') {
 919             shift();
 920             if (parseMultilineComment())
 921                 goto start;
 922             goto returnError;
 923         }
 924         if (m_current == '=') {
 925             shift();
 926             token = DIVEQUAL;
 927             break;
 928         }
 929         token = DIVIDE;
 930         break;
 931     case CharacterAnd:
 932         shift();
 933         if (m_current == '&') {
 934             shift();
 935             token = AND;
 936             break;
 937         }
 938         if (m_current == '=') {
 939             shift();
 940             token = ANDEQUAL;
 941             break;
 942         }
 943         token = BITAND;
 944         break;
 945     case CharacterXor:
 946         shift();
 947         if (m_current == '=') {
 948             shift();
 949             token = XOREQUAL;
 950             break;
 951         }
 952         token = BITXOR;
 953         break;
 954     case CharacterModulo:
 955         shift();
 956         if (m_current == '=') {
 957             shift();
 958             token = MODEQUAL;
 959             break;
 960         }
 961         token = MOD;
 962         break;
 963     case CharacterOr:
 964         shift();
 965         if (m_current == '=') {
 966             shift();
 967             token = OREQUAL;
 968             break;
 969         }
 970         if (m_current == '|') {
 971             shift();
 972             token = OR;
 973             break;
 974         }
 975         token = BITOR;
 976         break;
 977     case CharacterOpenParen:
 978         token = OPENPAREN;
 979         shift();
 980         break;
 981     case CharacterCloseParen:
 982         token = CLOSEPAREN;
 983         shift();
 984         break;
 985     case CharacterOpenBracket:
 986         token = OPENBRACKET;
 987         shift();
 988         break;
 989     case CharacterCloseBracket:
 990         token = CLOSEBRACKET;
 991         shift();
 992         break;
 993     case CharacterComma:
 994         token = COMMA;
 995         shift();
 996         break;
 997     case CharacterColon:
 998         token = COLON;
 999         shift();
1000         break;
1001     case CharacterQuestion:
1002         token = QUESTION;
1003         shift();
1004         break;
1005     case CharacterTilde:
1006         token = TILDE;
1007         shift();
1008         break;
1009     case CharacterSemicolon:
1010         m_delimited = true;
1011         shift();
1012         token = SEMICOLON;
1013         break;
1014     case CharacterOpenBrace:
1015         tokenData->intValue = currentOffset();
1016         shift();
1017         token = OPENBRACE;
1018         break;
1019     case CharacterCloseBrace:
1020         tokenData->intValue = currentOffset();
1021         m_delimited = true;
1022         shift();
1023         token = CLOSEBRACE;
1024         break;
1025     case CharacterDot:
1026         shift();
1027         if (!isASCIIDigit(m_current)) {
1028             token = DOT;
1029             break;
1030         }
1031         goto inNumberAfterDecimalPoint;
1032     case CharacterZero:
1033         shift();
1034         if ((m_current | 0x20) == 'x' && isASCIIHexDigit(peek(1))) {
1035             parseHex(tokenData->doubleValue);
1036             token = NUMBER;
1037         } else {
1038             record8('0');
1039             if (isASCIIOctalDigit(m_current)) {
1040                 if (parseOctal(tokenData->doubleValue)) {
1041                     if (strictMode)
1042                         goto returnError;
1043                     token = NUMBER;
1044                 }
1045             }
1046         }
1047         // Fall through into CharacterNumber
1048     case CharacterNumber:
1049         if (LIKELY(token != NUMBER)) {
1050             if (!parseDecimal(tokenData->doubleValue)) {
1051                 if (m_current == '.') {
1052                     shift();
1053 inNumberAfterDecimalPoint:
1054                     parseNumberAfterDecimalPoint();
1055                 }
1056                 if ((m_current | 0x20) == 'e')
1057                     if (!parseNumberAfterExponentIndicator())
1058                         goto returnError;
1059                 // Null-terminate string for strtod.
1060                 m_buffer8.append('\0');
1061                 tokenData->doubleValue = WTF::strtod(m_buffer8.data(), 0);
1062             }
1063             token = NUMBER;
1064         }
1065
1066         // No identifiers allowed directly after numeric literal, e.g. "3in" is bad.
1067         if (UNLIKELY(isIdentStart(m_current)))
1068             goto returnError;
1069         m_buffer8.resize(0);
1070         m_delimited = false;
1071         break;
1072     case CharacterQuote:
1073         if (lexType & DontBuildStrings) {
1074             if (UNLIKELY(!parseString<false>(tokenData, strictMode)))
1075                 goto returnError;
1076         } else {
1077             if (UNLIKELY(!parseString<true>(tokenData, strictMode)))
1078                 goto returnError;
1079         }
1080         shift();
1081         m_delimited = false;
1082         token = STRING;
1083         break;
1084     case CharacterIdentifierStart:
1085         ASSERT(isIdentStart(m_current));
1086         // Fall through into CharacterBackSlash.
1087     case CharacterBackSlash:
1088         if (lexType & DontBuildKeywords)
1089             token = parseIdentifier<false>(tokenData, lexType);
1090         else
1091             token = parseIdentifier<true>(tokenData, lexType);
1092         break;
1093     case CharacterLineTerminator:
1094         ASSERT(isLineTerminator(m_current));
1095         shiftLineTerminator();
1096         m_atLineStart = true;
1097         m_terminator = true;
1098         goto start;
1099     case CharacterInvalid:
1100         goto returnError;
1101     default:
1102         ASSERT_NOT_REACHED();
1103         goto returnError;
1104     }
1105
1106     m_atLineStart = false;
1107     goto returnToken;
1108
1109 inSingleLineComment:
1110     while (!isLineTerminator(m_current)) {
1111         if (UNLIKELY(m_current == -1))
1112             return EOFTOK;
1113         shift();
1114     }
1115     shiftLineTerminator();
1116     m_atLineStart = true;
1117     m_terminator = true;
1118     if (!lastTokenWasRestrKeyword())
1119         goto start;
1120
1121     token = SEMICOLON;
1122     m_delimited = true;
1123     // Fall through into returnToken.
1124
1125 returnToken:
1126     tokenInfo->line = m_lineNumber;
1127     tokenInfo->startOffset = startOffset;
1128     tokenInfo->endOffset = currentOffset();
1129     m_lastToken = token;
1130     return token;
1131
1132 returnError:
1133     m_error = true;
1134     return ERRORTOK;
1135 }
1136
1137 bool Lexer::scanRegExp(const Identifier*& pattern, const Identifier*& flags, UChar patternPrefix)
1138 {
1139     ASSERT(m_buffer16.isEmpty());
1140
1141     bool lastWasEscape = false;
1142     bool inBrackets = false;
1143
1144     if (patternPrefix) {
1145         ASSERT(!isLineTerminator(patternPrefix));
1146         ASSERT(patternPrefix != '/');
1147         ASSERT(patternPrefix != '[');
1148         record16(patternPrefix);
1149     }
1150
1151     while (true) {
1152         int current = m_current;
1153
1154         if (isLineTerminator(current) || current == -1) {
1155             m_buffer16.resize(0);
1156             return false;
1157         }
1158
1159         shift();
1160
1161         if (current == '/' && !lastWasEscape && !inBrackets)
1162             break;
1163
1164         record16(current);
1165
1166         if (lastWasEscape) {
1167             lastWasEscape = false;
1168             continue;
1169         }
1170
1171         switch (current) {
1172         case '[':
1173             inBrackets = true;
1174             break;
1175         case ']':
1176             inBrackets = false;
1177             break;
1178         case '\\':
1179             lastWasEscape = true;
1180             break;
1181         }
1182     }
1183
1184     pattern = makeIdentifier(m_buffer16.data(), m_buffer16.size());
1185     m_buffer16.resize(0);
1186
1187     while (isIdentPart(m_current)) {
1188         record16(m_current);
1189         shift();
1190     }
1191
1192     flags = makeIdentifier(m_buffer16.data(), m_buffer16.size());
1193     m_buffer16.resize(0);
1194
1195     return true;
1196 }
1197
1198 bool Lexer::skipRegExp()
1199 {
1200     bool lastWasEscape = false;
1201     bool inBrackets = false;
1202
1203     while (true) {
1204         int current = m_current;
1205
1206         if (isLineTerminator(current) || current == -1)
1207             return false;
1208
1209         shift();
1210
1211         if (current == '/' && !lastWasEscape && !inBrackets)
1212             break;
1213
1214         if (lastWasEscape) {
1215             lastWasEscape = false;
1216             continue;
1217         }
1218
1219         switch (current) {
1220         case '[':
1221             inBrackets = true;
1222             break;
1223         case ']':
1224             inBrackets = false;
1225             break;
1226         case '\\':
1227             lastWasEscape = true;
1228             break;
1229         }
1230     }
1231
1232     while (isIdentPart(m_current))
1233         shift();
1234
1235     return true;
1236 }
1237
1238 void Lexer::clear()
1239 {
1240     m_arena = 0;
1241
1242     Vector<char> newBuffer8;
1243     m_buffer8.swap(newBuffer8);
1244
1245     Vector<UChar> newBuffer16;
1246     m_buffer16.swap(newBuffer16);
1247
1248     m_isReparsing = false;
1249 }
1250
1251 SourceCode Lexer::sourceCode(int openBrace, int closeBrace, int firstLine)
1252 {
1253     ASSERT(m_source->provider()->data()[openBrace] == '{');
1254     ASSERT(m_source->provider()->data()[closeBrace] == '}');
1255     return SourceCode(m_source->provider(), openBrace, closeBrace + 1, firstLine);
1256 }
1257
1258 } // namespace JSC