parser/Lexer.cpp

   1 /*
   2  *  Copyright (C) 1999-2000 Harri Porten (porten@kde.org)
   3  *  Copyright (C) 2006, 2007, 2008, 2009, 2011, 2012 Apple Inc. All Rights Reserved.
   4  *  Copyright (C) 2007 Cameron Zwarich (cwzwarich@uwaterloo.ca)
   5  *  Copyright (C) 2010 Zoltan Herczeg (zherczeg@inf.u-szeged.hu)
   6  *  Copyright (C) 2012 Mathias Bynens (mathias@qiwi.be)
   7  *
   8  *  This library is free software; you can redistribute it and/or
   9  *  modify it under the terms of the GNU Library General Public
  10  *  License as published by the Free Software Foundation; either
  11  *  version 2 of the License, or (at your option) any later version.
  12  *
  13  *  This library is distributed in the hope that it will be useful,
  14  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
  15  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  16  *  Library General Public License for more details.
  17  *
  18  *  You should have received a copy of the GNU Library General Public License
  19  *  along with this library; see the file COPYING.LIB.  If not, write to
  20  *  the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
  21  *  Boston, MA 02110-1301, USA.
  22  *
  23  */
  24
  25 #include "config.h"
  26 #include "Lexer.h"
  27
  28 #include "JSFunction.h"
  29
  30 #include "JSGlobalObjectFunctions.h"
  31 #include "Identifier.h"
  32 #include "NodeInfo.h"
  33 #include "Nodes.h"
  34 #include <wtf/dtoa.h>
  35 #include <ctype.h>
  36 #include <limits.h>
  37 #include <string.h>
  38 #include <wtf/Assertions.h>
  39
  40 using namespace WTF;
  41 using namespace Unicode;
  42
  43 #include "KeywordLookup.h"
  44 #include "Lexer.lut.h"
  45 #include "Parser.h"
  46
  47 namespace JSC {
  48
  49 Keywords::Keywords(JSGlobalData* globalData)
  50     : m_globalData(globalData)
  51     , m_keywordTable(JSC::mainTable)
  52 {
  53 }
  54
  55 enum CharacterType {
  56     // Types for the main switch
  57
  58     // The first three types are fixed, and also used for identifying
  59     // ASCII alpha and alphanumeric characters (see isIdentStart and isIdentPart).
  60     CharacterIdentifierStart,
  61     CharacterZero,
  62     CharacterNumber,
  63
  64     CharacterInvalid,
  65     CharacterLineTerminator,
  66     CharacterExclamationMark,
  67     CharacterOpenParen,
  68     CharacterCloseParen,
  69     CharacterOpenBracket,
  70     CharacterCloseBracket,
  71     CharacterComma,
  72     CharacterColon,
  73     CharacterQuestion,
  74     CharacterTilde,
  75     CharacterQuote,
  76     CharacterDot,
  77     CharacterSlash,
  78     CharacterBackSlash,
  79     CharacterSemicolon,
  80     CharacterOpenBrace,
  81     CharacterCloseBrace,
  82
  83     CharacterAdd,
  84     CharacterSub,
  85     CharacterMultiply,
  86     CharacterModulo,
  87     CharacterAnd,
  88     CharacterXor,
  89     CharacterOr,
  90     CharacterLess,
  91     CharacterGreater,
  92     CharacterEqual,
  93
  94     // Other types (only one so far)
  95     CharacterWhiteSpace,
  96 };
  97
// Lookup table giving the CharacterType of each of the 256 Latin-1 code points.
// It is indexed directly by the code unit value (see isIdentStart and isIdentPart).
// Each entry's comment shows the decimal code followed by either the character
// itself or its Unicode general category.
static const unsigned short typesOfLatin1Characters[256] = {
/*   0 - Null               */ CharacterInvalid,
/*   1 - Start of Heading   */ CharacterInvalid,
/*   2 - Start of Text      */ CharacterInvalid,
/*   3 - End of Text        */ CharacterInvalid,
/*   4 - End of Transm.     */ CharacterInvalid,
/*   5 - Enquiry            */ CharacterInvalid,
/*   6 - Acknowledgment     */ CharacterInvalid,
/*   7 - Bell               */ CharacterInvalid,
/*   8 - Back Space         */ CharacterInvalid,
/*   9 - Horizontal Tab     */ CharacterWhiteSpace,
/*  10 - Line Feed          */ CharacterLineTerminator,
/*  11 - Vertical Tab       */ CharacterWhiteSpace,
/*  12 - Form Feed          */ CharacterWhiteSpace,
/*  13 - Carriage Return    */ CharacterLineTerminator,
/*  14 - Shift Out          */ CharacterInvalid,
/*  15 - Shift In           */ CharacterInvalid,
/*  16 - Data Link Escape   */ CharacterInvalid,
/*  17 - Device Control 1   */ CharacterInvalid,
/*  18 - Device Control 2   */ CharacterInvalid,
/*  19 - Device Control 3   */ CharacterInvalid,
/*  20 - Device Control 4   */ CharacterInvalid,
/*  21 - Negative Ack.      */ CharacterInvalid,
/*  22 - Synchronous Idle   */ CharacterInvalid,
/*  23 - End of Tr. Block   */ CharacterInvalid,
/*  24 - Cancel             */ CharacterInvalid,
/*  25 - End of Medium      */ CharacterInvalid,
/*  26 - Substitute         */ CharacterInvalid,
/*  27 - Escape             */ CharacterInvalid,
/*  28 - File Separator     */ CharacterInvalid,
/*  29 - Group Separator    */ CharacterInvalid,
/*  30 - Record Separator   */ CharacterInvalid,
/*  31 - Unit Separator     */ CharacterInvalid,
/*  32 - Space              */ CharacterWhiteSpace,
/*  33 - !                  */ CharacterExclamationMark,
/*  34 - "                  */ CharacterQuote,
/*  35 - #                  */ CharacterInvalid,
/*  36 - $                  */ CharacterIdentifierStart,
/*  37 - %                  */ CharacterModulo,
/*  38 - &                  */ CharacterAnd,
/*  39 - '                  */ CharacterQuote,
/*  40 - (                  */ CharacterOpenParen,
/*  41 - )                  */ CharacterCloseParen,
/*  42 - *                  */ CharacterMultiply,
/*  43 - +                  */ CharacterAdd,
/*  44 - ,                  */ CharacterComma,
/*  45 - -                  */ CharacterSub,
/*  46 - .                  */ CharacterDot,
/*  47 - /                  */ CharacterSlash,
/*  48 - 0                  */ CharacterZero,
/*  49 - 1                  */ CharacterNumber,
/*  50 - 2                  */ CharacterNumber,
/*  51 - 3                  */ CharacterNumber,
/*  52 - 4                  */ CharacterNumber,
/*  53 - 5                  */ CharacterNumber,
/*  54 - 6                  */ CharacterNumber,
/*  55 - 7                  */ CharacterNumber,
/*  56 - 8                  */ CharacterNumber,
/*  57 - 9                  */ CharacterNumber,
/*  58 - :                  */ CharacterColon,
/*  59 - ;                  */ CharacterSemicolon,
/*  60 - <                  */ CharacterLess,
/*  61 - =                  */ CharacterEqual,
/*  62 - >                  */ CharacterGreater,
/*  63 - ?                  */ CharacterQuestion,
/*  64 - @                  */ CharacterInvalid,
/*  65 - A                  */ CharacterIdentifierStart,
/*  66 - B                  */ CharacterIdentifierStart,
/*  67 - C                  */ CharacterIdentifierStart,
/*  68 - D                  */ CharacterIdentifierStart,
/*  69 - E                  */ CharacterIdentifierStart,
/*  70 - F                  */ CharacterIdentifierStart,
/*  71 - G                  */ CharacterIdentifierStart,
/*  72 - H                  */ CharacterIdentifierStart,
/*  73 - I                  */ CharacterIdentifierStart,
/*  74 - J                  */ CharacterIdentifierStart,
/*  75 - K                  */ CharacterIdentifierStart,
/*  76 - L                  */ CharacterIdentifierStart,
/*  77 - M                  */ CharacterIdentifierStart,
/*  78 - N                  */ CharacterIdentifierStart,
/*  79 - O                  */ CharacterIdentifierStart,
/*  80 - P                  */ CharacterIdentifierStart,
/*  81 - Q                  */ CharacterIdentifierStart,
/*  82 - R                  */ CharacterIdentifierStart,
/*  83 - S                  */ CharacterIdentifierStart,
/*  84 - T                  */ CharacterIdentifierStart,
/*  85 - U                  */ CharacterIdentifierStart,
/*  86 - V                  */ CharacterIdentifierStart,
/*  87 - W                  */ CharacterIdentifierStart,
/*  88 - X                  */ CharacterIdentifierStart,
/*  89 - Y                  */ CharacterIdentifierStart,
/*  90 - Z                  */ CharacterIdentifierStart,
/*  91 - [                  */ CharacterOpenBracket,
/*  92 - \                  */ CharacterBackSlash,
/*  93 - ]                  */ CharacterCloseBracket,
/*  94 - ^                  */ CharacterXor,
/*  95 - _                  */ CharacterIdentifierStart,
/*  96 - `                  */ CharacterInvalid,
/*  97 - a                  */ CharacterIdentifierStart,
/*  98 - b                  */ CharacterIdentifierStart,
/*  99 - c                  */ CharacterIdentifierStart,
/* 100 - d                  */ CharacterIdentifierStart,
/* 101 - e                  */ CharacterIdentifierStart,
/* 102 - f                  */ CharacterIdentifierStart,
/* 103 - g                  */ CharacterIdentifierStart,
/* 104 - h                  */ CharacterIdentifierStart,
/* 105 - i                  */ CharacterIdentifierStart,
/* 106 - j                  */ CharacterIdentifierStart,
/* 107 - k                  */ CharacterIdentifierStart,
/* 108 - l                  */ CharacterIdentifierStart,
/* 109 - m                  */ CharacterIdentifierStart,
/* 110 - n                  */ CharacterIdentifierStart,
/* 111 - o                  */ CharacterIdentifierStart,
/* 112 - p                  */ CharacterIdentifierStart,
/* 113 - q                  */ CharacterIdentifierStart,
/* 114 - r                  */ CharacterIdentifierStart,
/* 115 - s                  */ CharacterIdentifierStart,
/* 116 - t                  */ CharacterIdentifierStart,
/* 117 - u                  */ CharacterIdentifierStart,
/* 118 - v                  */ CharacterIdentifierStart,
/* 119 - w                  */ CharacterIdentifierStart,
/* 120 - x                  */ CharacterIdentifierStart,
/* 121 - y                  */ CharacterIdentifierStart,
/* 122 - z                  */ CharacterIdentifierStart,
/* 123 - {                  */ CharacterOpenBrace,
/* 124 - |                  */ CharacterOr,
/* 125 - }                  */ CharacterCloseBrace,
/* 126 - ~                  */ CharacterTilde,
/* 127 - Delete             */ CharacterInvalid,
/* 128 - Cc category        */ CharacterInvalid,
/* 129 - Cc category        */ CharacterInvalid,
/* 130 - Cc category        */ CharacterInvalid,
/* 131 - Cc category        */ CharacterInvalid,
/* 132 - Cc category        */ CharacterInvalid,
/* 133 - Cc category        */ CharacterInvalid,
/* 134 - Cc category        */ CharacterInvalid,
/* 135 - Cc category        */ CharacterInvalid,
/* 136 - Cc category        */ CharacterInvalid,
/* 137 - Cc category        */ CharacterInvalid,
/* 138 - Cc category        */ CharacterInvalid,
/* 139 - Cc category        */ CharacterInvalid,
/* 140 - Cc category        */ CharacterInvalid,
/* 141 - Cc category        */ CharacterInvalid,
/* 142 - Cc category        */ CharacterInvalid,
/* 143 - Cc category        */ CharacterInvalid,
/* 144 - Cc category        */ CharacterInvalid,
/* 145 - Cc category        */ CharacterInvalid,
/* 146 - Cc category        */ CharacterInvalid,
/* 147 - Cc category        */ CharacterInvalid,
/* 148 - Cc category        */ CharacterInvalid,
/* 149 - Cc category        */ CharacterInvalid,
/* 150 - Cc category        */ CharacterInvalid,
/* 151 - Cc category        */ CharacterInvalid,
/* 152 - Cc category        */ CharacterInvalid,
/* 153 - Cc category        */ CharacterInvalid,
/* 154 - Cc category        */ CharacterInvalid,
/* 155 - Cc category        */ CharacterInvalid,
/* 156 - Cc category        */ CharacterInvalid,
/* 157 - Cc category        */ CharacterInvalid,
/* 158 - Cc category        */ CharacterInvalid,
/* 159 - Cc category        */ CharacterInvalid,
/* 160 - Zs category (nbsp) */ CharacterWhiteSpace,
/* 161 - Po category        */ CharacterInvalid,
/* 162 - Sc category        */ CharacterInvalid,
/* 163 - Sc category        */ CharacterInvalid,
/* 164 - Sc category        */ CharacterInvalid,
/* 165 - Sc category        */ CharacterInvalid,
/* 166 - So category        */ CharacterInvalid,
/* 167 - So category        */ CharacterInvalid,
/* 168 - Sk category        */ CharacterInvalid,
/* 169 - So category        */ CharacterInvalid,
/* 170 - Ll category        */ CharacterIdentifierStart,
/* 171 - Pi category        */ CharacterInvalid,
/* 172 - Sm category        */ CharacterInvalid,
/* 173 - Cf category        */ CharacterInvalid,
/* 174 - So category        */ CharacterInvalid,
/* 175 - Sk category        */ CharacterInvalid,
/* 176 - So category        */ CharacterInvalid,
/* 177 - Sm category        */ CharacterInvalid,
/* 178 - No category        */ CharacterInvalid,
/* 179 - No category        */ CharacterInvalid,
/* 180 - Sk category        */ CharacterInvalid,
/* 181 - Ll category        */ CharacterIdentifierStart,
/* 182 - So category        */ CharacterInvalid,
/* 183 - Po category        */ CharacterInvalid,
/* 184 - Sk category        */ CharacterInvalid,
/* 185 - No category        */ CharacterInvalid,
/* 186 - Ll category        */ CharacterIdentifierStart,
/* 187 - Pf category        */ CharacterInvalid,
/* 188 - No category        */ CharacterInvalid,
/* 189 - No category        */ CharacterInvalid,
/* 190 - No category        */ CharacterInvalid,
/* 191 - Po category        */ CharacterInvalid,
/* 192 - Lu category        */ CharacterIdentifierStart,
/* 193 - Lu category        */ CharacterIdentifierStart,
/* 194 - Lu category        */ CharacterIdentifierStart,
/* 195 - Lu category        */ CharacterIdentifierStart,
/* 196 - Lu category        */ CharacterIdentifierStart,
/* 197 - Lu category        */ CharacterIdentifierStart,
/* 198 - Lu category        */ CharacterIdentifierStart,
/* 199 - Lu category        */ CharacterIdentifierStart,
/* 200 - Lu category        */ CharacterIdentifierStart,
/* 201 - Lu category        */ CharacterIdentifierStart,
/* 202 - Lu category        */ CharacterIdentifierStart,
/* 203 - Lu category        */ CharacterIdentifierStart,
/* 204 - Lu category        */ CharacterIdentifierStart,
/* 205 - Lu category        */ CharacterIdentifierStart,
/* 206 - Lu category        */ CharacterIdentifierStart,
/* 207 - Lu category        */ CharacterIdentifierStart,
/* 208 - Lu category        */ CharacterIdentifierStart,
/* 209 - Lu category        */ CharacterIdentifierStart,
/* 210 - Lu category        */ CharacterIdentifierStart,
/* 211 - Lu category        */ CharacterIdentifierStart,
/* 212 - Lu category        */ CharacterIdentifierStart,
/* 213 - Lu category        */ CharacterIdentifierStart,
/* 214 - Lu category        */ CharacterIdentifierStart,
/* 215 - Sm category        */ CharacterInvalid,
/* 216 - Lu category        */ CharacterIdentifierStart,
/* 217 - Lu category        */ CharacterIdentifierStart,
/* 218 - Lu category        */ CharacterIdentifierStart,
/* 219 - Lu category        */ CharacterIdentifierStart,
/* 220 - Lu category        */ CharacterIdentifierStart,
/* 221 - Lu category        */ CharacterIdentifierStart,
/* 222 - Lu category        */ CharacterIdentifierStart,
/* 223 - Ll category        */ CharacterIdentifierStart,
/* 224 - Ll category        */ CharacterIdentifierStart,
/* 225 - Ll category        */ CharacterIdentifierStart,
/* 226 - Ll category        */ CharacterIdentifierStart,
/* 227 - Ll category        */ CharacterIdentifierStart,
/* 228 - Ll category        */ CharacterIdentifierStart,
/* 229 - Ll category        */ CharacterIdentifierStart,
/* 230 - Ll category        */ CharacterIdentifierStart,
/* 231 - Ll category        */ CharacterIdentifierStart,
/* 232 - Ll category        */ CharacterIdentifierStart,
/* 233 - Ll category        */ CharacterIdentifierStart,
/* 234 - Ll category        */ CharacterIdentifierStart,
/* 235 - Ll category        */ CharacterIdentifierStart,
/* 236 - Ll category        */ CharacterIdentifierStart,
/* 237 - Ll category        */ CharacterIdentifierStart,
/* 238 - Ll category        */ CharacterIdentifierStart,
/* 239 - Ll category        */ CharacterIdentifierStart,
/* 240 - Ll category        */ CharacterIdentifierStart,
/* 241 - Ll category        */ CharacterIdentifierStart,
/* 242 - Ll category        */ CharacterIdentifierStart,
/* 243 - Ll category        */ CharacterIdentifierStart,
/* 244 - Ll category        */ CharacterIdentifierStart,
/* 245 - Ll category        */ CharacterIdentifierStart,
/* 246 - Ll category        */ CharacterIdentifierStart,
/* 247 - Sm category        */ CharacterInvalid,
/* 248 - Ll category        */ CharacterIdentifierStart,
/* 249 - Ll category        */ CharacterIdentifierStart,
/* 250 - Ll category        */ CharacterIdentifierStart,
/* 251 - Ll category        */ CharacterIdentifierStart,
/* 252 - Ll category        */ CharacterIdentifierStart,
/* 253 - Ll category        */ CharacterIdentifierStart,
/* 254 - Ll category        */ CharacterIdentifierStart,
/* 255 - Ll category        */ CharacterIdentifierStart
};
 357
 358 template <typename T>
 359 Lexer<T>::Lexer(JSGlobalData* globalData)
 360     : m_isReparsing(false)
 361     , m_globalData(globalData)
 362 {
 363 }
 364
 365 template <typename T>
 366 Lexer<T>::~Lexer()
 367 {
 368 }
 369
 370 template <typename T>
 371 UString Lexer<T>::invalidCharacterMessage() const
 372 {
 373     switch (m_current) {
 374     case 0:
 375         return "Invalid character: '\\0'";
 376     case 10:
 377         return "Invalid character: '\\n'";
 378     case 11:
 379         return "Invalid character: '\\v'";
 380     case 13:
 381         return "Invalid character: '\\r'";
 382     case 35:
 383         return "Invalid character: '#'";
 384     case 64:
 385         return "Invalid character: '@'";
 386     case 96:
 387         return "Invalid character: '`'";
 388     default:
 389         return String::format("Invalid character '\\u%04u'", static_cast<unsigned>(m_current)).impl();
 390     }
 391 }
 392
 393 template <typename T>
 394 ALWAYS_INLINE const T* Lexer<T>::currentCharacter() const
 395 {
 396     ASSERT(m_code <= m_codeEnd);
 397     return m_code;
 398 }
 399
 400 template <typename T>
 401 void Lexer<T>::setCode(const SourceCode& source, ParserArena* arena)
 402 {
 403     m_arena = &arena->identifierArena();
 404
 405     m_lineNumber = source.firstLine();
 406     m_lastToken = -1;
 407
 408     const StringImpl* sourceString = source.provider()->data();
 409
 410     if (sourceString)
 411         setCodeStart(sourceString);
 412     else
 413         m_codeStart = 0;
 414
 415     m_source = &source;
 416     m_code = m_codeStart + source.startOffset();
 417     m_codeEnd = m_codeStart + source.endOffset();
 418     m_error = false;
 419     m_atLineStart = true;
 420     m_lexErrorMessage = UString();
 421
 422     m_buffer8.reserveInitialCapacity(initialReadBufferCapacity);
 423     m_buffer16.reserveInitialCapacity((m_codeEnd - m_code) / 2);
 424
 425     if (LIKELY(m_code < m_codeEnd))
 426         m_current = *m_code;
 427     else
 428         m_current = 0;
 429     ASSERT(currentOffset() == source.startOffset());
 430 }
 431
 432 template <typename T>
 433 template <int shiftAmount> ALWAYS_INLINE void Lexer<T>::internalShift()
 434 {
 435     m_code += shiftAmount;
 436     m_current = *m_code;
 437 }
 438
// Advances the cursor by one code unit and refreshes m_current.
// If the cursor has reached m_codeEnd, m_current is left at 0 and the buffer is not read.
template <typename T>
ALWAYS_INLINE void Lexer<T>::shift()
{
    // At one point timing showed that setting m_current to 0 unconditionally was faster than an if-else sequence.
    m_current = 0;
    ++m_code;
    if (LIKELY(m_code < m_codeEnd))
        m_current = *m_code;
}
 448
 449 template <typename T>
 450 ALWAYS_INLINE bool Lexer<T>::atEnd() const
 451 {
 452     ASSERT(!m_current || m_code < m_codeEnd);
 453     return UNLIKELY(UNLIKELY(!m_current) && m_code == m_codeEnd);
 454 }
 455
 456 template <typename T>
 457 ALWAYS_INLINE T Lexer<T>::peek(int offset) const
 458 {
 459     ASSERT(offset > 0 && offset < 5);
 460     const T* code = m_code + offset;
 461     return (code < m_codeEnd) ? *code : 0;
 462 }
 463
 464 template <typename T>
 465 int Lexer<T>::parseFourDigitUnicodeHex()
 466 {
 467     T char1 = peek(1);
 468     T char2 = peek(2);
 469     T char3 = peek(3);
 470
 471     if (UNLIKELY(!isASCIIHexDigit(m_current) || !isASCIIHexDigit(char1) || !isASCIIHexDigit(char2) || !isASCIIHexDigit(char3)))
 472         return -1;
 473
 474     int result = convertUnicode(m_current, char1, char2, char3);
 475     shift();
 476     shift();
 477     shift();
 478     shift();
 479     return result;
 480 }
 481
 482 template <typename T>
 483 void Lexer<T>::shiftLineTerminator()
 484 {
 485     ASSERT(isLineTerminator(m_current));
 486
 487     T prev = m_current;
 488     shift();
 489
 490     // Allow both CRLF and LFCR.
 491     if (prev + m_current == '\n' + '\r')
 492         shift();
 493
 494     ++m_lineNumber;
 495 }
 496
 497 template <typename T>
 498 ALWAYS_INLINE bool Lexer<T>::lastTokenWasRestrKeyword() const
 499 {
 500     return m_lastToken == CONTINUE || m_lastToken == BREAK || m_lastToken == RETURN || m_lastToken == THROW;
 501 }
 502
 503 static NEVER_INLINE bool isNonLatin1IdentStart(int c)
 504 {
 505     return category(c) & (Letter_Uppercase | Letter_Lowercase | Letter_Titlecase | Letter_Modifier | Letter_Other);
 506 }
 507
 508 static ALWAYS_INLINE bool isLatin1(LChar)
 509 {
 510     return true;
 511 }
 512
 513 static ALWAYS_INLINE bool isLatin1(UChar c)
 514 {
 515     return c < 256;
 516 }
 517
 518 static inline bool isIdentStart(LChar c)
 519 {
 520     return typesOfLatin1Characters[c] == CharacterIdentifierStart;
 521 }
 522
 523 static inline bool isIdentStart(UChar c)
 524 {
 525     return isLatin1(c) ? isIdentStart(static_cast<LChar>(c)) : isNonLatin1IdentStart(c);
 526 }
 527
 528 static NEVER_INLINE bool isNonLatin1IdentPart(int c)
 529 {
 530     return (category(c) & (Letter_Uppercase | Letter_Lowercase | Letter_Titlecase | Letter_Modifier | Letter_Other
 531         | Mark_NonSpacing | Mark_SpacingCombining | Number_DecimalDigit | Punctuation_Connector)) || c == 0x200C || c == 0x200D;
 532 }
 533
 534 static ALWAYS_INLINE bool isIdentPart(LChar c)
 535 {
 536     // Character types are divided into two groups depending on whether they can be part of an
 537     // identifier or not. Those whose type value is less or equal than CharacterNumber can be
 538     // part of an identifier. (See the CharacterType definition for more details.)
 539     return typesOfLatin1Characters[c] <= CharacterNumber;
 540 }
 541
 542 static ALWAYS_INLINE bool isIdentPart(UChar c)
 543 {
 544     return isLatin1(c) ? isIdentPart(static_cast<LChar>(c)) : isNonLatin1IdentPart(c);
 545 }
 546
 547 static inline int singleEscape(int c)
 548 {
 549     switch (c) {
 550     case 'b':
 551         return 0x08;
 552     case 't':
 553         return 0x09;
 554     case 'n':
 555         return 0x0A;
 556     case 'v':
 557         return 0x0B;
 558     case 'f':
 559         return 0x0C;
 560     case 'r':
 561         return 0x0D;
 562     case '\\':
 563         return '\\';
 564     case '\'':
 565         return '\'';
 566     case '"':
 567         return '"';
 568     default:
 569         return 0;
 570     }
 571 }
 572
 573 template <typename T>
 574 inline void Lexer<T>::record8(int c)
 575 {
 576     ASSERT(c >= 0);
 577     ASSERT(c <= 0xFF);
 578     m_buffer8.append(static_cast<LChar>(c));
 579 }
 580
// Debug-only check that a character fits in 8 bits (0-0xFF).
// Generic version: asserts both the lower and the upper bound.
template <typename T>
inline void assertCharIsIn8BitRange(T c)
{
    // Keeps the parameter "used" when ASSERT compiles to nothing.
    UNUSED_PARAM(c);
    ASSERT(c >= 0);
    ASSERT(c <= 0xFF);
}

// UChar specialization: only the upper bound is asserted.
template <>
inline void assertCharIsIn8BitRange(UChar c)
{
    UNUSED_PARAM(c);
    ASSERT(c <= 0xFF);
}

// LChar specialization: nothing is checked.
template <>
inline void assertCharIsIn8BitRange(LChar)
{
}
 600
 601 template <typename T>
 602 inline void Lexer<T>::append8(const T* p, size_t length)
 603 {
 604     size_t currentSize = m_buffer8.size();
 605     m_buffer8.grow(currentSize + length);
 606     LChar* rawBuffer = m_buffer8.data() + currentSize;
 607
 608     for (size_t i = 0; i < length; i++) {
 609         T c = p[i];
 610         assertCharIsIn8BitRange(c);
 611         rawBuffer[i] = c;
 612     }
 613 }
 614
 615 template <typename T>
 616 inline void Lexer<T>::append16(const LChar* p, size_t length)
 617 {
 618     size_t currentSize = m_buffer16.size();
 619     m_buffer16.grow(currentSize + length);
 620     UChar* rawBuffer = m_buffer16.data() + currentSize;
 621
 622     for (size_t i = 0; i < length; i++)
 623         rawBuffer[i] = p[i];
 624 }
 625
 626 template <typename T>
 627 inline void Lexer<T>::record16(T c)
 628 {
 629     m_buffer16.append(c);
 630 }
 631
 632 template <typename T>
 633 inline void Lexer<T>::record16(int c)
 634 {
 635     ASSERT(c >= 0);
 636     ASSERT(c <= static_cast<int>(USHRT_MAX));
 637     m_buffer16.append(static_cast<UChar>(c));
 638 }
 639
 640 template <>
 641 template <bool shouldCreateIdentifier> ALWAYS_INLINE JSTokenType Lexer<LChar>::parseIdentifier(JSTokenData* tokenData, unsigned lexerFlags, bool strictMode)
 642 {
 643     const ptrdiff_t remaining = m_codeEnd - m_code;
 644     if ((remaining >= maxTokenLength) && !(lexerFlags & LexerFlagsIgnoreReservedWords)) {
 645         JSTokenType keyword = parseKeyword<shouldCreateIdentifier>(tokenData);
 646         if (keyword != IDENT) {
 647             ASSERT((!shouldCreateIdentifier) || tokenData->ident);
 648             return keyword == RESERVED_IF_STRICT && !strictMode ? IDENT : keyword;
 649         }
 650     }
 651
 652     const LChar* identifierStart = currentCharacter();
 653
 654     while (isIdentPart(m_current))
 655         shift();
 656
 657     if (UNLIKELY(m_current == '\\')) {
 658         setOffsetFromCharOffset(identifierStart);
 659         return parseIdentifierSlowCase<shouldCreateIdentifier>(tokenData, lexerFlags, strictMode);
 660     }
 661
 662     const Identifier* ident = 0;
 663
 664     if (shouldCreateIdentifier) {
 665         int identifierLength = currentCharacter() - identifierStart;
 666         ident = makeIdentifier(identifierStart, identifierLength);
 667
 668         tokenData->ident = ident;
 669     } else
 670         tokenData->ident = 0;
 671
 672     if (UNLIKELY((remaining < maxTokenLength) && !(lexerFlags & LexerFlagsIgnoreReservedWords))) {
 673         ASSERT(shouldCreateIdentifier);
 674         if (remaining < maxTokenLength) {
 675             const HashEntry* entry = m_globalData->keywords->getKeyword(*ident);
 676             ASSERT((remaining < maxTokenLength) || !entry);
 677             if (!entry)
 678                 return IDENT;
 679             JSTokenType token = static_cast<JSTokenType>(entry->lexerValue());
 680             return (token != RESERVED_IF_STRICT) || strictMode ? token : IDENT;
 681         }
 682         return IDENT;
 683     }
 684
 685     return IDENT;
 686 }
 687
 688 template <>
 689 template <bool shouldCreateIdentifier> ALWAYS_INLINE JSTokenType Lexer<UChar>::parseIdentifier(JSTokenData* tokenData, unsigned lexerFlags, bool strictMode)
 690 {
 691     const ptrdiff_t remaining = m_codeEnd - m_code;
 692     if ((remaining >= maxTokenLength) && !(lexerFlags & LexerFlagsIgnoreReservedWords)) {
 693         JSTokenType keyword = parseKeyword<shouldCreateIdentifier>(tokenData);
 694         if (keyword != IDENT) {
 695             ASSERT((!shouldCreateIdentifier) || tokenData->ident);
 696             return keyword == RESERVED_IF_STRICT && !strictMode ? IDENT : keyword;
 697         }
 698     }
 699
 700     const UChar* identifierStart = currentCharacter();
 701
 702     UChar orAllChars = 0;
 703
 704     while (isIdentPart(m_current)) {
 705         orAllChars |= m_current;
 706         shift();
 707     }
 708
 709     if (UNLIKELY(m_current == '\\')) {
 710         setOffsetFromCharOffset(identifierStart);
 711         return parseIdentifierSlowCase<shouldCreateIdentifier>(tokenData, lexerFlags, strictMode);
 712     }
 713
 714     bool isAll8Bit = false;
 715
 716     if (!(orAllChars & ~0xff))
 717         isAll8Bit = true;
 718
 719     const Identifier* ident = 0;
 720
 721     if (shouldCreateIdentifier) {
 722         int identifierLength = currentCharacter() - identifierStart;
 723         if (isAll8Bit)
 724             ident = makeIdentifierLCharFromUChar(identifierStart, identifierLength);
 725         else
 726             ident = makeIdentifier(identifierStart, identifierLength);
 727
 728         tokenData->ident = ident;
 729     } else
 730         tokenData->ident = 0;
 731
 732     if (UNLIKELY((remaining < maxTokenLength) && !(lexerFlags & LexerFlagsIgnoreReservedWords))) {
 733         ASSERT(shouldCreateIdentifier);
 734         if (remaining < maxTokenLength) {
 735             const HashEntry* entry = m_globalData->keywords->getKeyword(*ident);
 736             ASSERT((remaining < maxTokenLength) || !entry);
 737             if (!entry)
 738                 return IDENT;
 739             JSTokenType token = static_cast<JSTokenType>(entry->lexerValue());
 740             return (token != RESERVED_IF_STRICT) || strictMode ? token : IDENT;
 741         }
 742         return IDENT;
 743     }
 744
 745     return IDENT;
 746 }
 747
// Lexes an identifier that may contain \uXXXX escapes, starting at the current character.
// Runs of plain identifier characters are copied into m_buffer16 only once an escape
// has been seen; an identifier without escapes is built directly from the source.
// Returns ERRORTOK for a malformed escape or an escaped character that is not valid
// at its position, otherwise IDENT or a keyword token.
template <typename T>
template <bool shouldCreateIdentifier> JSTokenType Lexer<T>::parseIdentifierSlowCase(JSTokenData* tokenData, unsigned lexerFlags, bool strictMode)
{
    const ptrdiff_t remaining = m_codeEnd - m_code;
    // Start of the current run of unescaped characters not yet copied to m_buffer16.
    const T* identifierStart = currentCharacter();
    bool bufferRequired = false;

    while (true) {
        if (LIKELY(isIdentPart(m_current))) {
            shift();
            continue;
        }
        // Anything that is neither an identifier part nor a backslash ends the identifier.
        if (LIKELY(m_current != '\\'))
            break;

        // \uXXXX unicode characters.
        bufferRequired = true;
        // Flush the pending run of plain characters before handling the escape.
        if (identifierStart != currentCharacter())
            m_buffer16.append(identifierStart, currentCharacter() - identifierStart);
        shift();
        // NOTE(review): the ERRORTOK returns below do not reset m_buffer16 - confirm
        // that callers do not rely on it being empty after a lexing error.
        if (UNLIKELY(m_current != 'u'))
            return ERRORTOK;
        shift();
        int character = parseFourDigitUnicodeHex();
        if (UNLIKELY(character == -1))
            return ERRORTOK;
        UChar ucharacter = static_cast<UChar>(character);
        // An empty buffer is treated as "first character of the identifier", so the
        // escaped character must then be a valid identifier start.
        // NOTE(review): escapes are recorded only when shouldCreateIdentifier is true, so
        // with it false the buffer may still be empty at a later escape - verify.
        if (UNLIKELY(m_buffer16.size() ? !isIdentPart(ucharacter) : !isIdentStart(ucharacter)))
            return ERRORTOK;
        if (shouldCreateIdentifier)
            record16(ucharacter);
        // The next plain run starts right after the escape.
        identifierStart = currentCharacter();
    }

    int identifierLength;
    const Identifier* ident = 0;
    if (shouldCreateIdentifier) {
        if (!bufferRequired) {
            // No escape was seen: build the identifier straight from the source.
            identifierLength = currentCharacter() - identifierStart;
            ident = makeIdentifier(identifierStart, identifierLength);
        } else {
            // Flush the trailing plain run, then build from the accumulated buffer.
            if (identifierStart != currentCharacter())
                m_buffer16.append(identifierStart, currentCharacter() - identifierStart);
            ident = makeIdentifier(m_buffer16.data(), m_buffer16.size());
        }

        tokenData->ident = ident;
    } else
        tokenData->ident = 0;

    if (LIKELY(!bufferRequired && !(lexerFlags & LexerFlagsIgnoreReservedWords))) {
        ASSERT(shouldCreateIdentifier);
        // Keywords must not be recognized if there was an \uXXXX in the identifier.
        // The table lookup is only done when fewer than maxTokenLength characters remained on entry.
        if (remaining < maxTokenLength) {
            const HashEntry* entry = m_globalData->keywords->getKeyword(*ident);
            ASSERT((remaining < maxTokenLength) || !entry);
            if (!entry)
                return IDENT;
            JSTokenType token = static_cast<JSTokenType>(entry->lexerValue());
            return (token != RESERVED_IF_STRICT) || strictMode ? token : IDENT;
        }
        return IDENT;
    }

    // Reached when an escape was used or reserved words are ignored: empty the scratch buffer.
    m_buffer16.resize(0);
    return IDENT;
}
 815
 816 static ALWAYS_INLINE bool characterRequiresParseStringSlowCase(LChar character)
 817 {
 818     return character < 0xE;
 819 }
 820
 821 static ALWAYS_INLINE bool characterRequiresParseStringSlowCase(UChar character)
 822 {
 823     return character < 0xE || character > 0xFF;
 824 }
 825
 826 template <typename T>
 827 template <bool shouldBuildStrings> ALWAYS_INLINE bool Lexer<T>::parseString(JSTokenData* tokenData, bool strictMode)
 828 {
 829     int startingOffset = currentOffset();
 830     int startingLineNumber = lineNumber();
 831     T stringQuoteCharacter = m_current;
 832     shift();
 833
 834     const T* stringStart = currentCharacter();
 835
 836     while (m_current != stringQuoteCharacter) {
 837         if (UNLIKELY(m_current == '\\')) {
 838             if (stringStart != currentCharacter() && shouldBuildStrings)
 839                 append8(stringStart, currentCharacter() - stringStart);
 840             shift();
 841
 842             int escape = singleEscape(m_current);
 843
 844             // Most common escape sequences first
 845             if (escape) {
 846                 if (shouldBuildStrings)
 847                     record8(escape);
 848                 shift();
 849             } else if (UNLIKELY(isLineTerminator(m_current)))
 850                 shiftLineTerminator();
 851             else if (m_current == 'x') {
 852                 shift();
 853                 if (!isASCIIHexDigit(m_current) || !isASCIIHexDigit(peek(1))) {
 854                     m_lexErrorMessage = "\\x can only be followed by a hex character sequence";
 855                     return false;
 856                 }
 857                 T prev = m_current;
 858                 shift();
 859                 if (shouldBuildStrings)
 860                     record8(convertHex(prev, m_current));
 861                 shift();
 862             } else {
 863                 setOffset(startingOffset);
 864                 setLineNumber(startingLineNumber);
 865                 m_buffer8.resize(0);
 866                 return parseStringSlowCase<shouldBuildStrings>(tokenData, strictMode);
 867             }
 868             stringStart = currentCharacter();
 869             continue;
 870         }
 871
 872         if (UNLIKELY(characterRequiresParseStringSlowCase(m_current))) {
 873             setOffset(startingOffset);
 874             setLineNumber(startingLineNumber);
 875             m_buffer8.resize(0);
 876             return parseStringSlowCase<shouldBuildStrings>(tokenData, strictMode);
 877         }
 878
 879         shift();
 880     }
 881
 882     if (currentCharacter() != stringStart && shouldBuildStrings)
 883         append8(stringStart, currentCharacter() - stringStart);
 884     if (shouldBuildStrings) {
 885         tokenData->ident = makeIdentifier(m_buffer8.data(), m_buffer8.size());
 886         m_buffer8.resize(0);
 887     } else
 888         tokenData->ident = 0;
 889
 890     return true;
 891 }
 892
 893 template <typename T>
 894 template <bool shouldBuildStrings> bool Lexer<T>::parseStringSlowCase(JSTokenData* tokenData, bool strictMode)
 895 {
 896     T stringQuoteCharacter = m_current;
 897     shift();
 898
 899     const T* stringStart = currentCharacter();
 900
 901     while (m_current != stringQuoteCharacter) {
 902         if (UNLIKELY(m_current == '\\')) {
 903             if (stringStart != currentCharacter() && shouldBuildStrings)
 904                 append16(stringStart, currentCharacter() - stringStart);
 905             shift();
 906
 907             int escape = singleEscape(m_current);
 908
 909             // Most common escape sequences first
 910             if (escape) {
 911                 if (shouldBuildStrings)
 912                     record16(escape);
 913                 shift();
 914             } else if (UNLIKELY(isLineTerminator(m_current)))
 915                 shiftLineTerminator();
 916             else if (m_current == 'x') {
 917                 shift();
 918                 if (!isASCIIHexDigit(m_current) || !isASCIIHexDigit(peek(1))) {
 919                     m_lexErrorMessage = "\\x can only be followed by a hex character sequence";
 920                     return false;
 921                 }
 922                 T prev = m_current;
 923                 shift();
 924                 if (shouldBuildStrings)
 925                     record16(convertHex(prev, m_current));
 926                 shift();
 927             } else if (m_current == 'u') {
 928                 shift();
 929                 int character = parseFourDigitUnicodeHex();
 930                 if (character != -1) {
 931                     if (shouldBuildStrings)
 932                         record16(character);
 933                 } else if (m_current == stringQuoteCharacter) {
 934                     if (shouldBuildStrings)
 935                         record16('u');
 936                 } else {
 937                     m_lexErrorMessage = "\\u can only be followed by a Unicode character sequence";
 938                     return false;
 939                 }
 940             } else if (strictMode && isASCIIDigit(m_current)) {
 941                 // The only valid numeric escape in strict mode is '\0', and this must not be followed by a decimal digit.
 942                 int character1 = m_current;
 943                 shift();
 944                 if (character1 != '0' || isASCIIDigit(m_current)) {
 945                     m_lexErrorMessage = "The only valid numeric escape in strict mode is '\\0'";
 946                     return false;
 947                 }
 948                 if (shouldBuildStrings)
 949                     record16(0);
 950             } else if (!strictMode && isASCIIOctalDigit(m_current)) {
 951                 // Octal character sequences
 952                 T character1 = m_current;
 953                 shift();
 954                 if (isASCIIOctalDigit(m_current)) {
 955                     // Two octal characters
 956                     T character2 = m_current;
 957                     shift();
 958                     if (character1 >= '0' && character1 <= '3' && isASCIIOctalDigit(m_current)) {
 959                         if (shouldBuildStrings)
 960                             record16((character1 - '0') * 64 + (character2 - '0') * 8 + m_current - '0');
 961                         shift();
 962                     } else {
 963                         if (shouldBuildStrings)
 964                             record16((character1 - '0') * 8 + character2 - '0');
 965                     }
 966                 } else {
 967                     if (shouldBuildStrings)
 968                         record16(character1 - '0');
 969                 }
 970             } else if (!atEnd()) {
 971                 if (shouldBuildStrings)
 972                     record16(m_current);
 973                 shift();
 974             } else {
 975                 m_lexErrorMessage = "Unterminated string constant";
 976                 return false;
 977             }
 978
 979             stringStart = currentCharacter();
 980             continue;
 981         }
 982         // Fast check for characters that require special handling.
 983         // Catches 0, \n, \r, 0x2028, and 0x2029 as efficiently
 984         // as possible, and lets through all common ASCII characters.
 985         if (UNLIKELY(((static_cast<unsigned>(m_current) - 0xE) & 0x2000))) {
 986             // New-line or end of input is not allowed
 987             if (atEnd() || isLineTerminator(m_current)) {
 988                 m_lexErrorMessage = "Unexpected EOF";
 989                 return false;
 990             }
 991             // Anything else is just a normal character
 992         }
 993         shift();
 994     }
 995
 996     if (currentCharacter() != stringStart && shouldBuildStrings)
 997         append16(stringStart, currentCharacter() - stringStart);
 998     if (shouldBuildStrings)
 999         tokenData->ident = makeIdentifier(m_buffer16.data(), m_buffer16.size());
1000     else
1001         tokenData->ident = 0;
1002
1003     m_buffer16.resize(0);
1004     return true;
1005 }
1006
1007 template <typename T>
1008 ALWAYS_INLINE void Lexer<T>::parseHex(double& returnValue)
1009 {
1010     // Optimization: most hexadecimal values fit into 4 bytes.
1011     uint32_t hexValue = 0;
1012     int maximumDigits = 7;
1013
1014     // Shift out the 'x' prefix.
1015     shift();
1016
1017     do {
1018         hexValue = (hexValue << 4) + toASCIIHexValue(m_current);
1019         shift();
1020         --maximumDigits;
1021     } while (isASCIIHexDigit(m_current) && maximumDigits >= 0);
1022
1023     if (maximumDigits >= 0) {
1024         returnValue = hexValue;
1025         return;
1026     }
1027
1028     // No more place in the hexValue buffer.
1029     // The values are shifted out and placed into the m_buffer8 vector.
1030     for (int i = 0; i < 8; ++i) {
1031          int digit = hexValue >> 28;
1032          if (digit < 10)
1033              record8(digit + '0');
1034          else
1035              record8(digit - 10 + 'a');
1036          hexValue <<= 4;
1037     }
1038
1039     while (isASCIIHexDigit(m_current)) {
1040         record8(m_current);
1041         shift();
1042     }
1043
1044     returnValue = parseIntOverflow(m_buffer8.data(), m_buffer8.size(), 16);
1045 }
1046
1047 template <typename T>
1048 ALWAYS_INLINE bool Lexer<T>::parseOctal(double& returnValue)
1049 {
1050     // Optimization: most octal values fit into 4 bytes.
1051     uint32_t octalValue = 0;
1052     int maximumDigits = 9;
1053     // Temporary buffer for the digits. Makes easier
1054     // to reconstruct the input characters when needed.
1055     LChar digits[10];
1056
1057     do {
1058         octalValue = octalValue * 8 + (m_current - '0');
1059         digits[maximumDigits] = m_current;
1060         shift();
1061         --maximumDigits;
1062     } while (isASCIIOctalDigit(m_current) && maximumDigits >= 0);
1063
1064     if (!isASCIIDigit(m_current) && maximumDigits >= 0) {
1065         returnValue = octalValue;
1066         return true;
1067     }
1068
1069     for (int i = 9; i > maximumDigits; --i)
1070          record8(digits[i]);
1071
1072     while (isASCIIOctalDigit(m_current)) {
1073         record8(m_current);
1074         shift();
1075     }
1076
1077     if (isASCIIDigit(m_current))
1078         return false;
1079
1080     returnValue = parseIntOverflow(m_buffer8.data(), m_buffer8.size(), 8);
1081     return true;
1082 }
1083
1084 template <typename T>
1085 ALWAYS_INLINE bool Lexer<T>::parseDecimal(double& returnValue)
1086 {
1087     // Optimization: most decimal values fit into 4 bytes.
1088     uint32_t decimalValue = 0;
1089
1090     // Since parseOctal may be executed before parseDecimal,
1091     // the m_buffer8 may hold ascii digits.
1092     if (!m_buffer8.size()) {
1093         int maximumDigits = 9;
1094         // Temporary buffer for the digits. Makes easier
1095         // to reconstruct the input characters when needed.
1096         LChar digits[10];
1097
1098         do {
1099             decimalValue = decimalValue * 10 + (m_current - '0');
1100             digits[maximumDigits] = m_current;
1101             shift();
1102             --maximumDigits;
1103         } while (isASCIIDigit(m_current) && maximumDigits >= 0);
1104
1105         if (maximumDigits >= 0 && m_current != '.' && (m_current | 0x20) != 'e') {
1106             returnValue = decimalValue;
1107             return true;
1108         }
1109
1110         for (int i = 9; i > maximumDigits; --i)
1111             record8(digits[i]);
1112     }
1113
1114     while (isASCIIDigit(m_current)) {
1115         record8(m_current);
1116         shift();
1117     }
1118
1119     return false;
1120 }
1121
1122 template <typename T>
1123 ALWAYS_INLINE void Lexer<T>::parseNumberAfterDecimalPoint()
1124 {
1125     record8('.');
1126     while (isASCIIDigit(m_current)) {
1127         record8(m_current);
1128         shift();
1129     }
1130 }
1131
1132 template <typename T>
1133 ALWAYS_INLINE bool Lexer<T>::parseNumberAfterExponentIndicator()
1134 {
1135     record8('e');
1136     shift();
1137     if (m_current == '+' || m_current == '-') {
1138         record8(m_current);
1139         shift();
1140     }
1141
1142     if (!isASCIIDigit(m_current))
1143         return false;
1144
1145     do {
1146         record8(m_current);
1147         shift();
1148     } while (isASCIIDigit(m_current));
1149     return true;
1150 }
1151
1152 template <typename T>
1153 ALWAYS_INLINE bool Lexer<T>::parseMultilineComment()
1154 {
1155     while (true) {
1156         while (UNLIKELY(m_current == '*')) {
1157             shift();
1158             if (m_current == '/') {
1159                 shift();
1160                 return true;
1161             }
1162         }
1163
1164         if (atEnd())
1165             return false;
1166
1167         if (isLineTerminator(m_current)) {
1168             shiftLineTerminator();
1169             m_terminator = true;
1170         } else
1171             shift();
1172     }
1173 }
1174
1175 template <typename T>
1176 bool Lexer<T>::nextTokenIsColon()
1177 {
1178     const T* code = m_code;
1179     while (code < m_codeEnd && (isWhiteSpace(*code) || isLineTerminator(*code)))
1180         code++;
1181
1182     return code < m_codeEnd && *code == ':';
1183 }
1184
1185 template <typename T>
1186 JSTokenType Lexer<T>::lex(JSTokenData* tokenData, JSTokenInfo* tokenInfo, unsigned lexerFlags, bool strictMode)
1187 {
1188     ASSERT(!m_error);
1189     ASSERT(m_buffer8.isEmpty());
1190     ASSERT(m_buffer16.isEmpty());
1191
1192     JSTokenType token = ERRORTOK;
1193     m_terminator = false;
1194
1195 start:
1196     while (isWhiteSpace(m_current))
1197         shift();
1198
1199     if (atEnd())
1200         return EOFTOK;
1201
1202     tokenInfo->startOffset = currentOffset();
1203
1204     CharacterType type;
1205     if (LIKELY(isLatin1(m_current)))
1206         type = static_cast<CharacterType>(typesOfLatin1Characters[m_current]);
1207     else if (isNonLatin1IdentStart(m_current))
1208         type = CharacterIdentifierStart;
1209     else if (isLineTerminator(m_current))
1210         type = CharacterLineTerminator;
1211     else
1212         type = CharacterInvalid;
1213
1214     switch (type) {
1215     case CharacterGreater:
1216         shift();
1217         if (m_current == '>') {
1218             shift();
1219             if (m_current == '>') {
1220                 shift();
1221                 if (m_current == '=') {
1222                     shift();
1223                     token = URSHIFTEQUAL;
1224                     break;
1225                 }
1226                 token = URSHIFT;
1227                 break;
1228             }
1229             if (m_current == '=') {
1230                 shift();
1231                 token = RSHIFTEQUAL;
1232                 break;
1233             }
1234             token = RSHIFT;
1235             break;
1236         }
1237         if (m_current == '=') {
1238             shift();
1239             token = GE;
1240             break;
1241         }
1242         token = GT;
1243         break;
1244     case CharacterEqual:
1245         shift();
1246         if (m_current == '=') {
1247             shift();
1248             if (m_current == '=') {
1249                 shift();
1250                 token = STREQ;
1251                 break;
1252             }
1253             token = EQEQ;
1254             break;
1255         }
1256         token = EQUAL;
1257         break;
1258     case CharacterLess:
1259         shift();
1260         if (m_current == '!' && peek(1) == '-' && peek(2) == '-') {
1261             // <!-- marks the beginning of a line comment (for www usage)
1262             goto inSingleLineComment;
1263         }
1264         if (m_current == '<') {
1265             shift();
1266             if (m_current == '=') {
1267                 shift();
1268                 token = LSHIFTEQUAL;
1269                 break;
1270             }
1271             token = LSHIFT;
1272             break;
1273         }
1274         if (m_current == '=') {
1275             shift();
1276             token = LE;
1277             break;
1278         }
1279         token = LT;
1280         break;
1281     case CharacterExclamationMark:
1282         shift();
1283         if (m_current == '=') {
1284             shift();
1285             if (m_current == '=') {
1286                 shift();
1287                 token = STRNEQ;
1288                 break;
1289             }
1290             token = NE;
1291             break;
1292         }
1293         token = EXCLAMATION;
1294         break;
1295     case CharacterAdd:
1296         shift();
1297         if (m_current == '+') {
1298             shift();
1299             token = (!m_terminator) ? PLUSPLUS : AUTOPLUSPLUS;
1300             break;
1301         }
1302         if (m_current == '=') {
1303             shift();
1304             token = PLUSEQUAL;
1305             break;
1306         }
1307         token = PLUS;
1308         break;
1309     case CharacterSub:
1310         shift();
1311         if (m_current == '-') {
1312             shift();
1313             if (m_atLineStart && m_current == '>') {
1314                 shift();
1315                 goto inSingleLineComment;
1316             }
1317             token = (!m_terminator) ? MINUSMINUS : AUTOMINUSMINUS;
1318             break;
1319         }
1320         if (m_current == '=') {
1321             shift();
1322             token = MINUSEQUAL;
1323             break;
1324         }
1325         token = MINUS;
1326         break;
1327     case CharacterMultiply:
1328         shift();
1329         if (m_current == '=') {
1330             shift();
1331             token = MULTEQUAL;
1332             break;
1333         }
1334         token = TIMES;
1335         break;
1336     case CharacterSlash:
1337         shift();
1338         if (m_current == '/') {
1339             shift();
1340             goto inSingleLineComment;
1341         }
1342         if (m_current == '*') {
1343             shift();
1344             if (parseMultilineComment())
1345                 goto start;
1346             m_lexErrorMessage = "Multiline comment was not closed properly";
1347             goto returnError;
1348         }
1349         if (m_current == '=') {
1350             shift();
1351             token = DIVEQUAL;
1352             break;
1353         }
1354         token = DIVIDE;
1355         break;
1356     case CharacterAnd:
1357         shift();
1358         if (m_current == '&') {
1359             shift();
1360             token = AND;
1361             break;
1362         }
1363         if (m_current == '=') {
1364             shift();
1365             token = ANDEQUAL;
1366             break;
1367         }
1368         token = BITAND;
1369         break;
1370     case CharacterXor:
1371         shift();
1372         if (m_current == '=') {
1373             shift();
1374             token = XOREQUAL;
1375             break;
1376         }
1377         token = BITXOR;
1378         break;
1379     case CharacterModulo:
1380         shift();
1381         if (m_current == '=') {
1382             shift();
1383             token = MODEQUAL;
1384             break;
1385         }
1386         token = MOD;
1387         break;
1388     case CharacterOr:
1389         shift();
1390         if (m_current == '=') {
1391             shift();
1392             token = OREQUAL;
1393             break;
1394         }
1395         if (m_current == '|') {
1396             shift();
1397             token = OR;
1398             break;
1399         }
1400         token = BITOR;
1401         break;
1402     case CharacterOpenParen:
1403         token = OPENPAREN;
1404         shift();
1405         break;
1406     case CharacterCloseParen:
1407         token = CLOSEPAREN;
1408         shift();
1409         break;
1410     case CharacterOpenBracket:
1411         token = OPENBRACKET;
1412         shift();
1413         break;
1414     case CharacterCloseBracket:
1415         token = CLOSEBRACKET;
1416         shift();
1417         break;
1418     case CharacterComma:
1419         token = COMMA;
1420         shift();
1421         break;
1422     case CharacterColon:
1423         token = COLON;
1424         shift();
1425         break;
1426     case CharacterQuestion:
1427         token = QUESTION;
1428         shift();
1429         break;
1430     case CharacterTilde:
1431         token = TILDE;
1432         shift();
1433         break;
1434     case CharacterSemicolon:
1435         shift();
1436         token = SEMICOLON;
1437         break;
1438     case CharacterOpenBrace:
1439         tokenData->intValue = currentOffset();
1440         shift();
1441         token = OPENBRACE;
1442         break;
1443     case CharacterCloseBrace:
1444         tokenData->intValue = currentOffset();
1445         shift();
1446         token = CLOSEBRACE;
1447         break;
1448     case CharacterDot:
1449         shift();
1450         if (!isASCIIDigit(m_current)) {
1451             token = DOT;
1452             break;
1453         }
1454         goto inNumberAfterDecimalPoint;
1455     case CharacterZero:
1456         shift();
1457         if ((m_current | 0x20) == 'x' && isASCIIHexDigit(peek(1))) {
1458             parseHex(tokenData->doubleValue);
1459             token = NUMBER;
1460         } else {
1461             record8('0');
1462             if (isASCIIOctalDigit(m_current)) {
1463                 if (parseOctal(tokenData->doubleValue)) {
1464                     if (strictMode) {
1465                         m_lexErrorMessage = "Octal escapes are forbidden in strict mode";
1466                         goto returnError;
1467                     }
1468                     token = NUMBER;
1469                 }
1470             }
1471         }
1472         // Fall through into CharacterNumber
1473     case CharacterNumber:
1474         if (LIKELY(token != NUMBER)) {
1475             if (!parseDecimal(tokenData->doubleValue)) {
1476                 if (m_current == '.') {
1477                     shift();
1478 inNumberAfterDecimalPoint:
1479                     parseNumberAfterDecimalPoint();
1480                 }
1481                 if ((m_current | 0x20) == 'e') {
1482                     if (!parseNumberAfterExponentIndicator()) {
1483                         m_lexErrorMessage = "Non-number found after exponent indicator";
1484                         goto returnError;
1485                     }
1486                 }
1487                 size_t parsedLength;
1488                 tokenData->doubleValue = parseDouble(m_buffer8.data(), m_buffer8.size(), parsedLength);
1489             }
1490             token = NUMBER;
1491         }
1492
1493         // No identifiers allowed directly after numeric literal, e.g. "3in" is bad.
1494         if (UNLIKELY(isIdentStart(m_current))) {
1495             m_lexErrorMessage = "At least one digit must occur after a decimal point";
1496             goto returnError;
1497         }
1498         m_buffer8.resize(0);
1499         break;
1500     case CharacterQuote:
1501         if (lexerFlags & LexerFlagsDontBuildStrings) {
1502             if (UNLIKELY(!parseString<false>(tokenData, strictMode)))
1503                 goto returnError;
1504         } else {
1505             if (UNLIKELY(!parseString<true>(tokenData, strictMode)))
1506                 goto returnError;
1507         }
1508         shift();
1509         token = STRING;
1510         break;
1511     case CharacterIdentifierStart:
1512         ASSERT(isIdentStart(m_current));
1513         // Fall through into CharacterBackSlash.
1514     case CharacterBackSlash:
1515         if (lexerFlags & LexexFlagsDontBuildKeywords)
1516             token = parseIdentifier<false>(tokenData, lexerFlags, strictMode);
1517         else
1518             token = parseIdentifier<true>(tokenData, lexerFlags, strictMode);
1519         break;
1520     case CharacterLineTerminator:
1521         ASSERT(isLineTerminator(m_current));
1522         shiftLineTerminator();
1523         m_atLineStart = true;
1524         m_terminator = true;
1525         goto start;
1526     case CharacterInvalid:
1527         m_lexErrorMessage = invalidCharacterMessage();
1528         goto returnError;
1529     default:
1530         ASSERT_NOT_REACHED();
1531         m_lexErrorMessage = "Internal Error";
1532         goto returnError;
1533     }
1534
1535     m_atLineStart = false;
1536     goto returnToken;
1537
1538 inSingleLineComment:
1539     while (!isLineTerminator(m_current)) {
1540         if (atEnd())
1541             return EOFTOK;
1542         shift();
1543     }
1544     shiftLineTerminator();
1545     m_atLineStart = true;
1546     m_terminator = true;
1547     if (!lastTokenWasRestrKeyword())
1548         goto start;
1549
1550     token = SEMICOLON;
1551     // Fall through into returnToken.
1552
1553 returnToken:
1554     tokenInfo->line = m_lineNumber;
1555     tokenInfo->endOffset = currentOffset();
1556     m_lastToken = token;
1557     return token;
1558
1559 returnError:
1560     m_error = true;
1561     tokenInfo->line = m_lineNumber;
1562     tokenInfo->endOffset = currentOffset();
1563     return ERRORTOK;
1564 }
1565
1566 template <typename T>
1567 bool Lexer<T>::scanRegExp(const Identifier*& pattern, const Identifier*& flags, UChar patternPrefix)
1568 {
1569     ASSERT(m_buffer16.isEmpty());
1570
1571     bool lastWasEscape = false;
1572     bool inBrackets = false;
1573
1574     if (patternPrefix) {
1575         ASSERT(!isLineTerminator(patternPrefix));
1576         ASSERT(patternPrefix != '/');
1577         ASSERT(patternPrefix != '[');
1578         record16(patternPrefix);
1579     }
1580
1581     while (true) {
1582         if (isLineTerminator(m_current) || atEnd()) {
1583             m_buffer16.resize(0);
1584             return false;
1585         }
1586
1587         T prev = m_current;
1588
1589         shift();
1590
1591         if (prev == '/' && !lastWasEscape && !inBrackets)
1592             break;
1593
1594         record16(prev);
1595
1596         if (lastWasEscape) {
1597             lastWasEscape = false;
1598             continue;
1599         }
1600
1601         switch (prev) {
1602         case '[':
1603             inBrackets = true;
1604             break;
1605         case ']':
1606             inBrackets = false;
1607             break;
1608         case '\\':
1609             lastWasEscape = true;
1610             break;
1611         }
1612     }
1613
1614     pattern = makeIdentifier(m_buffer16.data(), m_buffer16.size());
1615     m_buffer16.resize(0);
1616
1617     while (isIdentPart(m_current)) {
1618         record16(m_current);
1619         shift();
1620     }
1621
1622     flags = makeIdentifier(m_buffer16.data(), m_buffer16.size());
1623     m_buffer16.resize(0);
1624
1625     return true;
1626 }
1627
1628 template <typename T>
1629 bool Lexer<T>::skipRegExp()
1630 {
1631     bool lastWasEscape = false;
1632     bool inBrackets = false;
1633
1634     while (true) {
1635         if (isLineTerminator(m_current) || atEnd())
1636             return false;
1637
1638         T prev = m_current;
1639
1640         shift();
1641
1642         if (prev == '/' && !lastWasEscape && !inBrackets)
1643             break;
1644
1645         if (lastWasEscape) {
1646             lastWasEscape = false;
1647             continue;
1648         }
1649
1650         switch (prev) {
1651         case '[':
1652             inBrackets = true;
1653             break;
1654         case ']':
1655             inBrackets = false;
1656             break;
1657         case '\\':
1658             lastWasEscape = true;
1659             break;
1660         }
1661     }
1662
1663     while (isIdentPart(m_current))
1664         shift();
1665
1666     return true;
1667 }
1668
1669 template <typename T>
1670 void Lexer<T>::clear()
1671 {
1672     m_arena = 0;
1673
1674     Vector<LChar> newBuffer8;
1675     m_buffer8.swap(newBuffer8);
1676
1677     Vector<UChar> newBuffer16;
1678     m_buffer16.swap(newBuffer16);
1679
1680     m_isReparsing = false;
1681 }
1682
1683 template <typename T>
1684 SourceCode Lexer<T>::sourceCode(int openBrace, int closeBrace, int firstLine)
1685 {
1686     ASSERT((*m_source->provider()->data())[openBrace] == '{');
1687     ASSERT((*m_source->provider()->data())[closeBrace] == '}');
1688     return SourceCode(m_source->provider(), openBrace, closeBrace + 1, firstLine);
1689 }
1690
1691 // Instantiate the two flavors of Lexer we need instead of putting most of this file in Lexer.h
1692 template class Lexer<LChar>;
1693 template class Lexer<UChar>;
1694
1695 } // namespace JSC