parser/Lexer.cpp

   1 /*
   2  *  Copyright (C) 1999-2000 Harri Porten (porten@kde.org)
   3  *  Copyright (C) 2006, 2007, 2008, 2009 Apple Inc. All Rights Reserved.
   4  *  Copyright (C) 2007 Cameron Zwarich (cwzwarich@uwaterloo.ca)
   5  *
   6  *  This library is free software; you can redistribute it and/or
   7  *  modify it under the terms of the GNU Library General Public
   8  *  License as published by the Free Software Foundation; either
   9  *  version 2 of the License, or (at your option) any later version.
  10  *
  11  *  This library is distributed in the hope that it will be useful,
  12  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
  13  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14  *  Library General Public License for more details.
  15  *
  16  *  You should have received a copy of the GNU Library General Public License
  17  *  along with this library; see the file COPYING.LIB.  If not, write to
  18  *  the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
  19  *  Boston, MA 02110-1301, USA.
  20  *
  21  */
  22
  23 #include "config.h"
  24 #include "Lexer.h"
  25
  26 #include "JSFunction.h"
  27 #include "JSGlobalObjectFunctions.h"
  28 #include "NodeInfo.h"
  29 #include "Nodes.h"
  30 #include "dtoa.h"
  31 #include <ctype.h>
  32 #include <limits.h>
  33 #include <string.h>
  34 #include <wtf/Assertions.h>
  35
  36 using namespace WTF;
  37 using namespace Unicode;
  38
  39 // We can't specify the namespace in yacc's C output, so do it here instead.
  40 using namespace JSC;
  41
  42 #ifndef KDE_USE_FINAL
  43 #include "Grammar.h"
  44 #endif
  45
  46 #include "Lookup.h"
  47 #include "Lexer.lut.h"
  48
  49 // A bridge for yacc from the C world to the C++ world.
  50 int jscyylex(void* lvalp, void* llocp, void* globalData)
  51 {
  52     return static_cast<JSGlobalData*>(globalData)->lexer->lex(lvalp, llocp);
  53 }
  54
  55 namespace JSC {
  56
  57 static const UChar byteOrderMark = 0xFEFF;
  58
  59 Lexer::Lexer(JSGlobalData* globalData)
  60     : m_isReparsing(false)
  61     , m_globalData(globalData)
  62     , m_keywordTable(JSC::mainTable)
  63 {
  64     m_buffer8.reserveInitialCapacity(initialReadBufferCapacity);
  65     m_buffer16.reserveInitialCapacity(initialReadBufferCapacity);
  66 }
  67
  68 Lexer::~Lexer()
  69 {
  70     m_keywordTable.deleteTable();
  71 }
  72
  73 inline const UChar* Lexer::currentCharacter() const
  74 {
  75     return m_code - 4;
  76 }
  77
  78 inline int Lexer::currentOffset() const
  79 {
  80     return currentCharacter() - m_codeStart;
  81 }
  82
  83 ALWAYS_INLINE void Lexer::shift1()
  84 {
  85     m_current = m_next1;
  86     m_next1 = m_next2;
  87     m_next2 = m_next3;
  88     if (LIKELY(m_code < m_codeEnd))
  89         m_next3 = m_code[0];
  90     else
  91         m_next3 = -1;
  92
  93     ++m_code;
  94 }
  95
  96 ALWAYS_INLINE void Lexer::shift2()
  97 {
  98     m_current = m_next2;
  99     m_next1 = m_next3;
 100     if (LIKELY(m_code + 1 < m_codeEnd)) {
 101         m_next2 = m_code[0];
 102         m_next3 = m_code[1];
 103     } else {
 104         m_next2 = m_code < m_codeEnd ? m_code[0] : -1;
 105         m_next3 = -1;
 106     }
 107
 108     m_code += 2;
 109 }
 110
 111 ALWAYS_INLINE void Lexer::shift3()
 112 {
 113     m_current = m_next3;
 114     if (LIKELY(m_code + 2 < m_codeEnd)) {
 115         m_next1 = m_code[0];
 116         m_next2 = m_code[1];
 117         m_next3 = m_code[2];
 118     } else {
 119         m_next1 = m_code < m_codeEnd ? m_code[0] : -1;
 120         m_next2 = m_code + 1 < m_codeEnd ? m_code[1] : -1;
 121         m_next3 = -1;
 122     }
 123
 124     m_code += 3;
 125 }
 126
 127 ALWAYS_INLINE void Lexer::shift4()
 128 {
 129     if (LIKELY(m_code + 3 < m_codeEnd)) {
 130         m_current = m_code[0];
 131         m_next1 = m_code[1];
 132         m_next2 = m_code[2];
 133         m_next3 = m_code[3];
 134     } else {
 135         m_current = m_code < m_codeEnd ? m_code[0] : -1;
 136         m_next1 = m_code + 1 < m_codeEnd ? m_code[1] : -1;
 137         m_next2 = m_code + 2 < m_codeEnd ? m_code[2] : -1;
 138         m_next3 = -1;
 139     }
 140
 141     m_code += 4;
 142 }
 143
 144 void Lexer::setCode(const SourceCode& source)
 145 {
 146     m_lineNumber = source.firstLine();
 147     m_delimited = false;
 148     m_lastToken = -1;
 149
 150     const UChar* data = source.provider()->data();
 151
 152     m_source = &source;
 153     m_codeStart = data;
 154     m_code = data + source.startOffset();
 155     m_codeEnd = data + source.endOffset();
 156     m_error = false;
 157     m_atLineStart = true;
 158
 159     // ECMA-262 calls for stripping all Cf characters, but we only strip BOM characters.
 160     // See <https://bugs.webkit.org/show_bug.cgi?id=4931> for details.
 161     if (source.provider()->hasBOMs()) {
 162         for (const UChar* p = m_codeStart; p < m_codeEnd; ++p) {
 163             if (UNLIKELY(*p == byteOrderMark)) {
 164                 copyCodeWithoutBOMs();
 165                 break;
 166             }
 167         }
 168     }
 169
 170     // Read the first characters into the 4-character buffer.
 171     shift4();
 172     ASSERT(currentOffset() == source.startOffset());
 173 }
 174
 175 void Lexer::copyCodeWithoutBOMs()
 176 {
 177     // Note: In this case, the character offset data for debugging will be incorrect.
 178     // If it's important to correctly debug code with extraneous BOMs, then the caller
 179     // should strip the BOMs when creating the SourceProvider object and do its own
 180     // mapping of offsets within the stripped text to original text offset.
 181
 182     m_codeWithoutBOMs.reserveCapacity(m_codeEnd - m_code);
 183     for (const UChar* p = m_code; p < m_codeEnd; ++p) {
 184         UChar c = *p;
 185         if (c != byteOrderMark)
 186             m_codeWithoutBOMs.append(c);
 187     }
 188     ptrdiff_t startDelta = m_codeStart - m_code;
 189     m_code = m_codeWithoutBOMs.data();
 190     m_codeStart = m_code + startDelta;
 191     m_codeEnd = m_codeWithoutBOMs.data() + m_codeWithoutBOMs.size();
 192 }
 193
 194 void Lexer::shiftLineTerminator()
 195 {
 196     ASSERT(isLineTerminator(m_current));
 197
 198     // Allow both CRLF and LFCR.
 199     if (m_current + m_next1 == '\n' + '\r')
 200         shift2();
 201     else
 202         shift1();
 203
 204     ++m_lineNumber;
 205 }
 206
 207 ALWAYS_INLINE Identifier* Lexer::makeIdentifier(const UChar* characters, size_t length)
 208 {
 209     m_identifiers.append(Identifier(m_globalData, characters, length));
 210     return &m_identifiers.last();
 211 }
 212
 213 inline bool Lexer::lastTokenWasRestrKeyword() const
 214 {
 215     return m_lastToken == CONTINUE || m_lastToken == BREAK || m_lastToken == RETURN || m_lastToken == THROW;
 216 }
 217
 218 static NEVER_INLINE bool isNonASCIIIdentStart(int c)
 219 {
 220     return category(c) & (Letter_Uppercase | Letter_Lowercase | Letter_Titlecase | Letter_Modifier | Letter_Other);
 221 }
 222
 223 static inline bool isIdentStart(int c)
 224 {
 225     return isASCII(c) ? isASCIIAlpha(c) || c == '$' || c == '_' : isNonASCIIIdentStart(c);
 226 }
 227
 228 static NEVER_INLINE bool isNonASCIIIdentPart(int c)
 229 {
 230     return category(c) & (Letter_Uppercase | Letter_Lowercase | Letter_Titlecase | Letter_Modifier | Letter_Other
 231         | Mark_NonSpacing | Mark_SpacingCombining | Number_DecimalDigit | Punctuation_Connector);
 232 }
 233
 234 static inline bool isIdentPart(int c)
 235 {
 236     return isASCII(c) ? isASCIIAlphanumeric(c) || c == '$' || c == '_' : isNonASCIIIdentPart(c);
 237 }
 238
 239 static inline int singleEscape(int c)
 240 {
 241     switch (c) {
 242         case 'b':
 243             return 0x08;
 244         case 't':
 245             return 0x09;
 246         case 'n':
 247             return 0x0A;
 248         case 'v':
 249             return 0x0B;
 250         case 'f':
 251             return 0x0C;
 252         case 'r':
 253             return 0x0D;
 254         default:
 255             return c;
 256     }
 257 }
 258
 259 inline void Lexer::record8(int c)
 260 {
 261     ASSERT(c >= 0);
 262     ASSERT(c <= 0xFF);
 263     m_buffer8.append(static_cast<char>(c));
 264 }
 265
 266 inline void Lexer::record16(UChar c)
 267 {
 268     m_buffer16.append(c);
 269 }
 270
 271 inline void Lexer::record16(int c)
 272 {
 273     ASSERT(c >= 0);
 274     ASSERT(c <= USHRT_MAX);
 275     record16(UChar(static_cast<unsigned short>(c)));
 276 }
 277
 278 int Lexer::lex(void* p1, void* p2)
 279 {
 280     ASSERT(!m_error);
 281     ASSERT(m_buffer8.isEmpty());
 282     ASSERT(m_buffer16.isEmpty());
 283
 284     YYSTYPE* lvalp = static_cast<YYSTYPE*>(p1);
 285     YYLTYPE* llocp = static_cast<YYLTYPE*>(p2);
 286     int token = 0;
 287     m_terminator = false;
 288
 289 start:
 290     while (isWhiteSpace(m_current))
 291         shift1();
 292
 293     int startOffset = currentOffset();
 294
 295     if (m_current == -1) {
 296         if (!m_terminator && !m_delimited && !m_isReparsing) {
 297             // automatic semicolon insertion if program incomplete
 298             token = ';';
 299             goto doneSemicolon;
 300         }
 301         return 0;
 302     }
 303
 304     m_delimited = false;
 305     switch (m_current) {
 306         case '>':
 307             if (m_next1 == '>' && m_next2 == '>') {
 308                 if (m_next3 == '=') {
 309                     shift4();
 310                     token = URSHIFTEQUAL;
 311                     break;
 312                 }
 313                 shift3();
 314                 token = URSHIFT;
 315                 break;
 316             }
 317             if (m_next1 == '>') {
 318                 if (m_next2 == '=') {
 319                     shift3();
 320                     token = RSHIFTEQUAL;
 321                     break;
 322                 }
 323                 shift2();
 324                 token = RSHIFT;
 325                 break;
 326             }
 327             if (m_next1 == '=') {
 328                 shift2();
 329                 token = GE;
 330                 break;
 331             }
 332             shift1();
 333             token = '>';
 334             break;
 335         case '=':
 336             if (m_next1 == '=') {
 337                 if (m_next2 == '=') {
 338                     shift3();
 339                     token = STREQ;
 340                     break;
 341                 }
 342                 shift2();
 343                 token = EQEQ;
 344                 break;
 345             }
 346             shift1();
 347             token = '=';
 348             break;
 349         case '!':
 350             if (m_next1 == '=') {
 351                 if (m_next2 == '=') {
 352                     shift3();
 353                     token = STRNEQ;
 354                     break;
 355                 }
 356                 shift2();
 357                 token = NE;
 358                 break;
 359             }
 360             shift1();
 361             token = '!';
 362             break;
 363         case '<':
 364             if (m_next1 == '!' && m_next2 == '-' && m_next3 == '-') {
 365                 // <!-- marks the beginning of a line comment (for www usage)
 366                 shift4();
 367                 goto inSingleLineComment;
 368             }
 369             if (m_next1 == '<') {
 370                 if (m_next2 == '=') {
 371                     shift3();
 372                     token = LSHIFTEQUAL;
 373                     break;
 374                 }
 375                 shift2();
 376                 token = LSHIFT;
 377                 break;
 378             }
 379             if (m_next1 == '=') {
 380                 shift2();
 381                 token = LE;
 382                 break;
 383             }
 384             shift1();
 385             token = '<';
 386             break;
 387         case '+':
 388             if (m_next1 == '+') {
 389                 shift2();
 390                 if (m_terminator) {
 391                     token = AUTOPLUSPLUS;
 392                     break;
 393                 }
 394                 token = PLUSPLUS;
 395                 break;
 396             }
 397             if (m_next1 == '=') {
 398                 shift2();
 399                 token = PLUSEQUAL;
 400                 break;
 401             }
 402             shift1();
 403             token = '+';
 404             break;
 405         case '-':
 406             if (m_next1 == '-') {
 407                 if (m_atLineStart && m_next2 == '>') {
 408                     shift3();
 409                     goto inSingleLineComment;
 410                 }
 411                 shift2();
 412                 if (m_terminator) {
 413                     token = AUTOMINUSMINUS;
 414                     break;
 415                 }
 416                 token = MINUSMINUS;
 417                 break;
 418             }
 419             if (m_next1 == '=') {
 420                 shift2();
 421                 token = MINUSEQUAL;
 422                 break;
 423             }
 424             shift1();
 425             token = '-';
 426             break;
 427         case '*':
 428             if (m_next1 == '=') {
 429                 shift2();
 430                 token = MULTEQUAL;
 431                 break;
 432             }
 433             shift1();
 434             token = '*';
 435             break;
 436         case '/':
 437             if (m_next1 == '/') {
 438                 shift2();
 439                 goto inSingleLineComment;
 440             }
 441             if (m_next1 == '*')
 442                 goto inMultiLineComment;
 443             if (m_next1 == '=') {
 444                 shift2();
 445                 token = DIVEQUAL;
 446                 break;
 447             }
 448             shift1();
 449             token = '/';
 450             break;
 451         case '&':
 452             if (m_next1 == '&') {
 453                 shift2();
 454                 token = AND;
 455                 break;
 456             }
 457             if (m_next1 == '=') {
 458                 shift2();
 459                 token = ANDEQUAL;
 460                 break;
 461             }
 462             shift1();
 463             token = '&';
 464             break;
 465         case '^':
 466             if (m_next1 == '=') {
 467                 shift2();
 468                 token = XOREQUAL;
 469                 break;
 470             }
 471             shift1();
 472             token = '^';
 473             break;
 474         case '%':
 475             if (m_next1 == '=') {
 476                 shift2();
 477                 token = MODEQUAL;
 478                 break;
 479             }
 480             shift1();
 481             token = '%';
 482             break;
 483         case '|':
 484             if (m_next1 == '=') {
 485                 shift2();
 486                 token = OREQUAL;
 487                 break;
 488             }
 489             if (m_next1 == '|') {
 490                 shift2();
 491                 token = OR;
 492                 break;
 493             }
 494             shift1();
 495             token = '|';
 496             break;
 497         case '.':
 498             if (isASCIIDigit(m_next1)) {
 499                 record8('.');
 500                 shift1();
 501                 goto inNumberAfterDecimalPoint;
 502             }
 503             token = '.';
 504             shift1();
 505             break;
 506         case ',':
 507         case '~':
 508         case '?':
 509         case ':':
 510         case '(':
 511         case ')':
 512         case '[':
 513         case ']':
 514             token = m_current;
 515             shift1();
 516             break;
 517         case ';':
 518             shift1();
 519             m_delimited = true;
 520             token = ';';
 521             break;
 522         case '{':
 523             lvalp->intValue = currentOffset();
 524             shift1();
 525             token = OPENBRACE;
 526             break;
 527         case '}':
 528             lvalp->intValue = currentOffset();
 529             shift1();
 530             m_delimited = true;
 531             token = CLOSEBRACE;
 532             break;
 533         case '\\':
 534             goto startIdentifierWithBackslash;
 535         case '0':
 536             goto startNumberWithZeroDigit;
 537         case '1':
 538         case '2':
 539         case '3':
 540         case '4':
 541         case '5':
 542         case '6':
 543         case '7':
 544         case '8':
 545         case '9':
 546             goto startNumber;
 547         case '"':
 548         case '\'':
 549             goto startString;
 550         default:
 551             if (isIdentStart(m_current))
 552                 goto startIdentifierOrKeyword;
 553             if (isLineTerminator(m_current)) {
 554                 shiftLineTerminator();
 555                 m_atLineStart = true;
 556                 m_terminator = true;
 557                 if (lastTokenWasRestrKeyword()) {
 558                     token = ';';
 559                     goto doneSemicolon;
 560                 }
 561                 goto start;
 562             }
 563             goto returnError;
 564     }
 565
 566     m_atLineStart = false;
 567     goto returnToken;
 568
 569 startString: {
 570     int stringQuoteCharacter = m_current;
 571     shift1();
 572
 573     const UChar* stringStart = currentCharacter();
 574     while (m_current != stringQuoteCharacter) {
 575         // Fast check for characters that require special handling.
 576         // Catches -1, \n, \r, \, 0x2028, and 0x2029 as efficiently
 577         // as possible, and lets through all common ASCII characters.
 578         if (UNLIKELY(m_current == '\\') || UNLIKELY(((static_cast<unsigned>(m_current) - 0xE) & 0x2000))) {
 579             m_buffer16.append(stringStart, currentCharacter() - stringStart);
 580             goto inString;
 581         }
 582         shift1();
 583     }
 584     lvalp->ident = makeIdentifier(stringStart, currentCharacter() - stringStart);
 585     shift1();
 586     m_atLineStart = false;
 587     m_delimited = false;
 588     token = STRING;
 589     goto returnToken;
 590
 591 inString:
 592     while (m_current != stringQuoteCharacter) {
 593         if (m_current == '\\')
 594             goto inStringEscapeSequence;
 595         if (UNLIKELY(isLineTerminator(m_current)))
 596             goto returnError;
 597         if (UNLIKELY(m_current == -1))
 598             goto returnError;
 599         record16(m_current);
 600         shift1();
 601     }
 602     goto doneString;
 603
 604 inStringEscapeSequence:
 605     shift1();
 606     if (m_current == 'x') {
 607         shift1();
 608         if (isASCIIHexDigit(m_current) && isASCIIHexDigit(m_next1)) {
 609             record16(convertHex(m_current, m_next1));
 610             shift2();
 611             goto inString;
 612         }
 613         record16('x');
 614         if (m_current == stringQuoteCharacter)
 615             goto doneString;
 616         goto inString;
 617     }
 618     if (m_current == 'u') {
 619         shift1();
 620         if (isASCIIHexDigit(m_current) && isASCIIHexDigit(m_next1) && isASCIIHexDigit(m_next2) && isASCIIHexDigit(m_next3)) {
 621             record16(convertUnicode(m_current, m_next1, m_next2, m_next3));
 622             shift4();
 623             goto inString;
 624         }
 625         if (m_current == stringQuoteCharacter) {
 626             record16('u');
 627             goto doneString;
 628         }
 629         goto returnError;
 630     }
 631     if (isASCIIOctalDigit(m_current)) {
 632         if (m_current >= '0' && m_current <= '3' && isASCIIOctalDigit(m_next1) && isASCIIOctalDigit(m_next2)) {
 633             record16((m_current - '0') * 64 + (m_next1 - '0') * 8 + m_next2 - '0');
 634             shift3();
 635             goto inString;
 636         }
 637         if (isASCIIOctalDigit(m_next1)) {
 638             record16((m_current - '0') * 8 + m_next1 - '0');
 639             shift2();
 640             goto inString;
 641         }
 642         record16(m_current - '0');
 643         shift1();
 644         goto inString;
 645     }
 646     if (isLineTerminator(m_current)) {
 647         shiftLineTerminator();
 648         goto inString;
 649     }
 650     record16(singleEscape(m_current));
 651     shift1();
 652     goto inString;
 653 }
 654
 655 startIdentifierWithBackslash:
 656     shift1();
 657     if (UNLIKELY(m_current != 'u'))
 658         goto returnError;
 659     shift1();
 660     if (UNLIKELY(!isASCIIHexDigit(m_current) || !isASCIIHexDigit(m_next1) || !isASCIIHexDigit(m_next2) || !isASCIIHexDigit(m_next3)))
 661         goto returnError;
 662     token = convertUnicode(m_current, m_next1, m_next2, m_next3);
 663     if (UNLIKELY(!isIdentStart(token)))
 664         goto returnError;
 665     goto inIdentifierAfterCharacterCheck;
 666
 667 startIdentifierOrKeyword: {
 668     const UChar* identifierStart = currentCharacter();
 669     shift1();
 670     while (isIdentPart(m_current))
 671         shift1();
 672     if (LIKELY(m_current != '\\')) {
 673         lvalp->ident = makeIdentifier(identifierStart, currentCharacter() - identifierStart);
 674         goto doneIdentifierOrKeyword;
 675     }
 676     m_buffer16.append(identifierStart, currentCharacter() - identifierStart);
 677 }
 678
 679     do {
 680         shift1();
 681         if (UNLIKELY(m_current != 'u'))
 682             goto returnError;
 683         shift1();
 684         if (UNLIKELY(!isASCIIHexDigit(m_current) || !isASCIIHexDigit(m_next1) || !isASCIIHexDigit(m_next2) || !isASCIIHexDigit(m_next3)))
 685             goto returnError;
 686         token = convertUnicode(m_current, m_next1, m_next2, m_next3);
 687         if (UNLIKELY(!isIdentPart(token)))
 688             goto returnError;
 689 inIdentifierAfterCharacterCheck:
 690         record16(token);
 691         shift4();
 692
 693         while (isIdentPart(m_current)) {
 694             record16(m_current);
 695             shift1();
 696         }
 697     } while (UNLIKELY(m_current == '\\'));
 698     goto doneIdentifier;
 699
 700 inSingleLineComment:
 701     while (!isLineTerminator(m_current)) {
 702         if (UNLIKELY(m_current == -1))
 703             return 0;
 704         shift1();
 705     }
 706     shiftLineTerminator();
 707     m_atLineStart = true;
 708     m_terminator = true;
 709     if (lastTokenWasRestrKeyword())
 710         goto doneSemicolon;
 711     goto start;
 712
 713 inMultiLineComment:
 714     shift2();
 715     while (m_current != '*' || m_next1 != '/') {
 716         if (isLineTerminator(m_current))
 717             shiftLineTerminator();
 718         else {
 719             shift1();
 720             if (UNLIKELY(m_current == -1))
 721                 goto returnError;
 722         }
 723     }
 724     shift2();
 725     m_atLineStart = false;
 726     goto start;
 727
 728 startNumberWithZeroDigit:
 729     shift1();
 730     if ((m_current | 0x20) == 'x' && isASCIIHexDigit(m_next1)) {
 731         shift1();
 732         goto inHex;
 733     }
 734     if (m_current == '.') {
 735         record8('0');
 736         record8('.');
 737         shift1();
 738         goto inNumberAfterDecimalPoint;
 739     }
 740     if ((m_current | 0x20) == 'e') {
 741         record8('0');
 742         record8('e');
 743         shift1();
 744         goto inExponentIndicator;
 745     }
 746     if (isASCIIOctalDigit(m_current))
 747         goto inOctal;
 748     if (isASCIIDigit(m_current))
 749         goto startNumber;
 750     lvalp->doubleValue = 0;
 751     goto doneNumeric;
 752
 753 inNumberAfterDecimalPoint:
 754     while (isASCIIDigit(m_current)) {
 755         record8(m_current);
 756         shift1();
 757     }
 758     if ((m_current | 0x20) == 'e') {
 759         record8('e');
 760         shift1();
 761         goto inExponentIndicator;
 762     }
 763     goto doneNumber;
 764
 765 inExponentIndicator:
 766     if (m_current == '+' || m_current == '-') {
 767         record8(m_current);
 768         shift1();
 769     }
 770     if (!isASCIIDigit(m_current))
 771         goto returnError;
 772     do {
 773         record8(m_current);
 774         shift1();
 775     } while (isASCIIDigit(m_current));
 776     goto doneNumber;
 777
 778 inOctal: {
 779     do {
 780         record8(m_current);
 781         shift1();
 782     } while (isASCIIOctalDigit(m_current));
 783     if (isASCIIDigit(m_current))
 784         goto startNumber;
 785
 786     double dval = 0;
 787
 788     const char* end = m_buffer8.end();
 789     for (const char* p = m_buffer8.data(); p < end; ++p) {
 790         dval *= 8;
 791         dval += *p - '0';
 792     }
 793     if (dval >= mantissaOverflowLowerBound)
 794         dval = parseIntOverflow(m_buffer8.data(), end - m_buffer8.data(), 8);
 795
 796     m_buffer8.resize(0);
 797
 798     lvalp->doubleValue = dval;
 799     goto doneNumeric;
 800 }
 801
 802 inHex: {
 803     do {
 804         record8(m_current);
 805         shift1();
 806     } while (isASCIIHexDigit(m_current));
 807
 808     double dval = 0;
 809
 810     const char* end = m_buffer8.end();
 811     for (const char* p = m_buffer8.data(); p < end; ++p) {
 812         dval *= 16;
 813         dval += toASCIIHexValue(*p);
 814     }
 815     if (dval >= mantissaOverflowLowerBound)
 816         dval = parseIntOverflow(m_buffer8.data(), end - m_buffer8.data(), 16);
 817
 818     m_buffer8.resize(0);
 819
 820     lvalp->doubleValue = dval;
 821     goto doneNumeric;
 822 }
 823
 824 startNumber:
 825     record8(m_current);
 826     shift1();
 827     while (isASCIIDigit(m_current)) {
 828         record8(m_current);
 829         shift1();
 830     }
 831     if (m_current == '.') {
 832         record8('.');
 833         shift1();
 834         goto inNumberAfterDecimalPoint;
 835     }
 836     if ((m_current | 0x20) == 'e') {
 837         record8('e');
 838         shift1();
 839         goto inExponentIndicator;
 840     }
 841
 842     // Fall through into doneNumber.
 843
 844 doneNumber:
 845     // Null-terminate string for strtod.
 846     m_buffer8.append('\0');
 847     lvalp->doubleValue = WTF::strtod(m_buffer8.data(), 0);
 848     m_buffer8.resize(0);
 849
 850     // Fall through into doneNumeric.
 851
 852 doneNumeric:
 853     // No identifiers allowed directly after numeric literal, e.g. "3in" is bad.
 854     if (UNLIKELY(isIdentStart(m_current)))
 855         goto returnError;
 856
 857     m_atLineStart = false;
 858     m_delimited = false;
 859     token = NUMBER;
 860     goto returnToken;
 861
 862 doneSemicolon:
 863     token = ';';
 864     m_delimited = true;
 865     goto returnToken;
 866
 867 doneIdentifier:
 868     m_atLineStart = false;
 869     m_delimited = false;
 870     lvalp->ident = makeIdentifier(m_buffer16.data(), m_buffer16.size());
 871     m_buffer16.resize(0);
 872     token = IDENT;
 873     goto returnToken;
 874
 875 doneIdentifierOrKeyword: {
 876     m_atLineStart = false;
 877     m_delimited = false;
 878     m_buffer16.resize(0);
 879     const HashEntry* entry = m_keywordTable.entry(m_globalData, *lvalp->ident);
 880     token = entry ? entry->lexerValue() : IDENT;
 881     goto returnToken;
 882 }
 883
 884 doneString:
 885     // Atomize constant strings in case they're later used in property lookup.
 886     shift1();
 887     m_atLineStart = false;
 888     m_delimited = false;
 889     lvalp->ident = makeIdentifier(m_buffer16.data(), m_buffer16.size());
 890     m_buffer16.resize(0);
 891     token = STRING;
 892
 893     // Fall through into returnToken.
 894
 895 returnToken: {
 896     int lineNumber = m_lineNumber;
 897     llocp->first_line = lineNumber;
 898     llocp->last_line = lineNumber;
 899     llocp->first_column = startOffset;
 900     llocp->last_column = currentOffset();
 901
 902     m_lastToken = token;
 903     return token;
 904 }
 905
 906 returnError:
 907     m_error = true;
 908     return -1;
 909 }
 910
 911 bool Lexer::scanRegExp()
 912 {
 913     ASSERT(m_buffer16.isEmpty());
 914
 915     bool lastWasEscape = false;
 916     bool inBrackets = false;
 917
 918     while (true) {
 919         if (isLineTerminator(m_current) || m_current == -1)
 920             return false;
 921         if (m_current != '/' || lastWasEscape || inBrackets) {
 922             // keep track of '[' and ']'
 923             if (!lastWasEscape) {
 924                 if (m_current == '[' && !inBrackets)
 925                     inBrackets = true;
 926                 if (m_current == ']' && inBrackets)
 927                     inBrackets = false;
 928             }
 929             record16(m_current);
 930             lastWasEscape = !lastWasEscape && m_current == '\\';
 931         } else { // end of regexp
 932             m_pattern = UString(m_buffer16);
 933             m_buffer16.resize(0);
 934             shift1();
 935             break;
 936         }
 937         shift1();
 938     }
 939
 940     while (isIdentPart(m_current)) {
 941         record16(m_current);
 942         shift1();
 943     }
 944     m_flags = UString(m_buffer16);
 945     m_buffer16.resize(0);
 946
 947     return true;
 948 }
 949
 950 void Lexer::clear()
 951 {
 952     m_identifiers.clear();
 953     m_codeWithoutBOMs.clear();
 954
 955     Vector<char> newBuffer8;
 956     newBuffer8.reserveInitialCapacity(initialReadBufferCapacity);
 957     m_buffer8.swap(newBuffer8);
 958
 959     Vector<UChar> newBuffer16;
 960     newBuffer16.reserveInitialCapacity(initialReadBufferCapacity);
 961     m_buffer16.swap(newBuffer16);
 962
 963     m_isReparsing = false;
 964
 965     m_pattern = UString();
 966     m_flags = UString();
 967 }
 968
 969 SourceCode Lexer::sourceCode(int openBrace, int closeBrace, int firstLine)
 970 {
 971     if (m_codeWithoutBOMs.isEmpty())
 972         return SourceCode(m_source->provider(), openBrace, closeBrace + 1, firstLine);
 973
 974     const UChar* data = m_source->provider()->data();
 975
 976     ASSERT(openBrace < closeBrace);
 977
 978     int numBOMsBeforeOpenBrace = 0;
 979     int numBOMsBetweenBraces = 0;
 980
 981     int i;
 982     for (i = m_source->startOffset(); i < openBrace; ++i)
 983         numBOMsBeforeOpenBrace += data[i] == byteOrderMark;
 984     for (; i < closeBrace; ++i)
 985         numBOMsBetweenBraces += data[i] == byteOrderMark;
 986
 987     return SourceCode(m_source->provider(), openBrace + numBOMsBeforeOpenBrace,
 988         closeBrace + numBOMsBeforeOpenBrace + numBOMsBetweenBraces + 1, firstLine);
 989 }
 990
 991 } // namespace JSC