]> git.saurik.com Git - apple/javascriptcore.git/blob - parser/Lexer.cpp
cae6bb99daeacfce8f75fa9783c6976936b22817
[apple/javascriptcore.git] / parser / Lexer.cpp
1 /*
2 * Copyright (C) 1999-2000 Harri Porten (porten@kde.org)
3 * Copyright (C) 2006, 2007, 2008, 2009 Apple Inc. All Rights Reserved.
4 * Copyright (C) 2007 Cameron Zwarich (cwzwarich@uwaterloo.ca)
5 * Copyright (C) 2010 Zoltan Herczeg (zherczeg@inf.u-szeged.hu)
6 *
7 * This library is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU Library General Public
9 * License as published by the Free Software Foundation; either
10 * version 2 of the License, or (at your option) any later version.
11 *
12 * This library is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 * Library General Public License for more details.
16 *
17 * You should have received a copy of the GNU Library General Public License
18 * along with this library; see the file COPYING.LIB. If not, write to
19 * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
20 * Boston, MA 02110-1301, USA.
21 *
22 */
23
24 #include "config.h"
25 #include "Lexer.h"
26
27 #include "JSFunction.h"
28
29 #include "JSGlobalObjectFunctions.h"
30 #include "Identifier.h"
31 #include "NodeInfo.h"
32 #include "Nodes.h"
33 #include "dtoa.h"
34 #include <ctype.h>
35 #include <limits.h>
36 #include <string.h>
37 #include <wtf/Assertions.h>
38
39 using namespace WTF;
40 using namespace Unicode;
41
42 #include "JSParser.h"
43 #include "KeywordLookup.h"
44 #include "Lookup.h"
45 #include "Lexer.lut.h"
46
47 namespace JSC {
48
49
// Classification of a source character, used to dispatch in the lexer's main switch.
enum CharacterType {
    // These three values are fixed: isIdentStart() tests for equality with
    // CharacterIdentifierStart and isIdentPart() tests for <= CharacterNumber,
    // so they must stay first and in this order.
    CharacterIdentifierStart = 0,
    CharacterZero = 1,
    CharacterNumber = 2,

    // Punctuators and other single-character classes.
    CharacterInvalid,
    CharacterLineTerminator,
    CharacterExclamationMark,
    CharacterOpenParen,
    CharacterCloseParen,
    CharacterOpenBracket,
    CharacterCloseBracket,
    CharacterComma,
    CharacterColon,
    CharacterQuestion,
    CharacterTilde,
    CharacterQuote,
    CharacterDot,
    CharacterSlash,
    CharacterBackSlash,
    CharacterSemicolon,
    CharacterOpenBrace,
    CharacterCloseBrace,

    // Characters that begin an operator.
    CharacterAdd,
    CharacterSub,
    CharacterMultiply,
    CharacterModulo,
    CharacterAnd,
    CharacterXor,
    CharacterOr,
    CharacterLess,
    CharacterGreater,
    CharacterEqual,

    // Not handled by the main switch (only one such type so far).
    CharacterWhiteSpace
};
92
93 // 128 ASCII codes
94 static const unsigned short typesOfASCIICharacters[128] = {
95 /* 0 - Null */ CharacterInvalid,
96 /* 1 - Start of Heading */ CharacterInvalid,
97 /* 2 - Start of Text */ CharacterInvalid,
98 /* 3 - End of Text */ CharacterInvalid,
99 /* 4 - End of Transm. */ CharacterInvalid,
100 /* 5 - Enquiry */ CharacterInvalid,
101 /* 6 - Acknowledgment */ CharacterInvalid,
102 /* 7 - Bell */ CharacterInvalid,
103 /* 8 - Back Space */ CharacterInvalid,
104 /* 9 - Horizontal Tab */ CharacterWhiteSpace,
105 /* 10 - Line Feed */ CharacterLineTerminator,
106 /* 11 - Vertical Tab */ CharacterWhiteSpace,
107 /* 12 - Form Feed */ CharacterWhiteSpace,
108 /* 13 - Carriage Return */ CharacterLineTerminator,
109 /* 14 - Shift Out */ CharacterInvalid,
110 /* 15 - Shift In */ CharacterInvalid,
111 /* 16 - Data Line Escape */ CharacterInvalid,
112 /* 17 - Device Control 1 */ CharacterInvalid,
113 /* 18 - Device Control 2 */ CharacterInvalid,
114 /* 19 - Device Control 3 */ CharacterInvalid,
115 /* 20 - Device Control 4 */ CharacterInvalid,
116 /* 21 - Negative Ack. */ CharacterInvalid,
117 /* 22 - Synchronous Idle */ CharacterInvalid,
118 /* 23 - End of Transmit */ CharacterInvalid,
119 /* 24 - Cancel */ CharacterInvalid,
120 /* 25 - End of Medium */ CharacterInvalid,
121 /* 26 - Substitute */ CharacterInvalid,
122 /* 27 - Escape */ CharacterInvalid,
123 /* 28 - File Separator */ CharacterInvalid,
124 /* 29 - Group Separator */ CharacterInvalid,
125 /* 30 - Record Separator */ CharacterInvalid,
126 /* 31 - Unit Separator */ CharacterInvalid,
127 /* 32 - Space */ CharacterWhiteSpace,
128 /* 33 - ! */ CharacterExclamationMark,
129 /* 34 - " */ CharacterQuote,
130 /* 35 - # */ CharacterInvalid,
131 /* 36 - $ */ CharacterIdentifierStart,
132 /* 37 - % */ CharacterModulo,
133 /* 38 - & */ CharacterAnd,
134 /* 39 - ' */ CharacterQuote,
135 /* 40 - ( */ CharacterOpenParen,
136 /* 41 - ) */ CharacterCloseParen,
137 /* 42 - * */ CharacterMultiply,
138 /* 43 - + */ CharacterAdd,
139 /* 44 - , */ CharacterComma,
140 /* 45 - - */ CharacterSub,
141 /* 46 - . */ CharacterDot,
142 /* 47 - / */ CharacterSlash,
143 /* 48 - 0 */ CharacterZero,
144 /* 49 - 1 */ CharacterNumber,
145 /* 50 - 2 */ CharacterNumber,
146 /* 51 - 3 */ CharacterNumber,
147 /* 52 - 4 */ CharacterNumber,
148 /* 53 - 5 */ CharacterNumber,
149 /* 54 - 6 */ CharacterNumber,
150 /* 55 - 7 */ CharacterNumber,
151 /* 56 - 8 */ CharacterNumber,
152 /* 57 - 9 */ CharacterNumber,
153 /* 58 - : */ CharacterColon,
154 /* 59 - ; */ CharacterSemicolon,
155 /* 60 - < */ CharacterLess,
156 /* 61 - = */ CharacterEqual,
157 /* 62 - > */ CharacterGreater,
158 /* 63 - ? */ CharacterQuestion,
159 /* 64 - @ */ CharacterInvalid,
160 /* 65 - A */ CharacterIdentifierStart,
161 /* 66 - B */ CharacterIdentifierStart,
162 /* 67 - C */ CharacterIdentifierStart,
163 /* 68 - D */ CharacterIdentifierStart,
164 /* 69 - E */ CharacterIdentifierStart,
165 /* 70 - F */ CharacterIdentifierStart,
166 /* 71 - G */ CharacterIdentifierStart,
167 /* 72 - H */ CharacterIdentifierStart,
168 /* 73 - I */ CharacterIdentifierStart,
169 /* 74 - J */ CharacterIdentifierStart,
170 /* 75 - K */ CharacterIdentifierStart,
171 /* 76 - L */ CharacterIdentifierStart,
172 /* 77 - M */ CharacterIdentifierStart,
173 /* 78 - N */ CharacterIdentifierStart,
174 /* 79 - O */ CharacterIdentifierStart,
175 /* 80 - P */ CharacterIdentifierStart,
176 /* 81 - Q */ CharacterIdentifierStart,
177 /* 82 - R */ CharacterIdentifierStart,
178 /* 83 - S */ CharacterIdentifierStart,
179 /* 84 - T */ CharacterIdentifierStart,
180 /* 85 - U */ CharacterIdentifierStart,
181 /* 86 - V */ CharacterIdentifierStart,
182 /* 87 - W */ CharacterIdentifierStart,
183 /* 88 - X */ CharacterIdentifierStart,
184 /* 89 - Y */ CharacterIdentifierStart,
185 /* 90 - Z */ CharacterIdentifierStart,
186 /* 91 - [ */ CharacterOpenBracket,
187 /* 92 - \ */ CharacterBackSlash,
188 /* 93 - ] */ CharacterCloseBracket,
189 /* 94 - ^ */ CharacterXor,
190 /* 95 - _ */ CharacterIdentifierStart,
191 /* 96 - ` */ CharacterInvalid,
192 /* 97 - a */ CharacterIdentifierStart,
193 /* 98 - b */ CharacterIdentifierStart,
194 /* 99 - c */ CharacterIdentifierStart,
195 /* 100 - d */ CharacterIdentifierStart,
196 /* 101 - e */ CharacterIdentifierStart,
197 /* 102 - f */ CharacterIdentifierStart,
198 /* 103 - g */ CharacterIdentifierStart,
199 /* 104 - h */ CharacterIdentifierStart,
200 /* 105 - i */ CharacterIdentifierStart,
201 /* 106 - j */ CharacterIdentifierStart,
202 /* 107 - k */ CharacterIdentifierStart,
203 /* 108 - l */ CharacterIdentifierStart,
204 /* 109 - m */ CharacterIdentifierStart,
205 /* 110 - n */ CharacterIdentifierStart,
206 /* 111 - o */ CharacterIdentifierStart,
207 /* 112 - p */ CharacterIdentifierStart,
208 /* 113 - q */ CharacterIdentifierStart,
209 /* 114 - r */ CharacterIdentifierStart,
210 /* 115 - s */ CharacterIdentifierStart,
211 /* 116 - t */ CharacterIdentifierStart,
212 /* 117 - u */ CharacterIdentifierStart,
213 /* 118 - v */ CharacterIdentifierStart,
214 /* 119 - w */ CharacterIdentifierStart,
215 /* 120 - x */ CharacterIdentifierStart,
216 /* 121 - y */ CharacterIdentifierStart,
217 /* 122 - z */ CharacterIdentifierStart,
218 /* 123 - { */ CharacterOpenBrace,
219 /* 124 - | */ CharacterOr,
220 /* 125 - } */ CharacterCloseBrace,
221 /* 126 - ~ */ CharacterTilde,
222 /* 127 - Delete */ CharacterInvalid,
223 };
224
225 Lexer::Lexer(JSGlobalData* globalData)
226 : m_isReparsing(false)
227 , m_globalData(globalData)
228 , m_keywordTable(JSC::mainTable)
229 {
230 }
231
232 Lexer::~Lexer()
233 {
234 m_keywordTable.deleteTable();
235 }
236
237 ALWAYS_INLINE const UChar* Lexer::currentCharacter() const
238 {
239 ASSERT(m_code <= m_codeEnd);
240 return m_code;
241 }
242
243 ALWAYS_INLINE int Lexer::currentOffset() const
244 {
245 return currentCharacter() - m_codeStart;
246 }
247
248 void Lexer::setCode(const SourceCode& source, ParserArena& arena)
249 {
250 m_arena = &arena.identifierArena();
251
252 m_lineNumber = source.firstLine();
253 m_delimited = false;
254 m_lastToken = -1;
255
256 const UChar* data = source.provider()->data();
257
258 m_source = &source;
259 m_codeStart = data;
260 m_code = data + source.startOffset();
261 m_codeEnd = data + source.endOffset();
262 m_error = false;
263 m_atLineStart = true;
264
265 m_buffer8.reserveInitialCapacity(initialReadBufferCapacity);
266 m_buffer16.reserveInitialCapacity((m_codeEnd - m_code) / 2);
267
268 if (LIKELY(m_code < m_codeEnd))
269 m_current = *m_code;
270 else
271 m_current = -1;
272 ASSERT(currentOffset() == source.startOffset());
273 }
274
275 template <int shiftAmount, Lexer::ShiftType shouldBoundsCheck> ALWAYS_INLINE void Lexer::internalShift()
276 {
277 if (shouldBoundsCheck == DoBoundsCheck) {
278 // Faster than an if-else sequence
279 ASSERT(m_current != -1);
280 m_current = -1;
281 m_code += shiftAmount;
282 if (LIKELY(m_code < m_codeEnd))
283 m_current = *m_code;
284 } else {
285 m_code += shiftAmount;
286 m_current = *m_code;
287 }
288 }
289
290 ALWAYS_INLINE void Lexer::shift()
291 {
292 internalShift<1, DoBoundsCheck>();
293 }
294
295 ALWAYS_INLINE int Lexer::peek(int offset)
296 {
297 // Only use if necessary
298 ASSERT(offset > 0 && offset < 5);
299 const UChar* code = m_code + offset;
300 return (code < m_codeEnd) ? *code : -1;
301 }
302
303 int Lexer::getUnicodeCharacter()
304 {
305 int char1 = peek(1);
306 int char2 = peek(2);
307 int char3 = peek(3);
308
309 if (UNLIKELY(!isASCIIHexDigit(m_current) || !isASCIIHexDigit(char1) || !isASCIIHexDigit(char2) || !isASCIIHexDigit(char3)))
310 return -1;
311
312 int result = convertUnicode(m_current, char1, char2, char3);
313 shift();
314 shift();
315 shift();
316 shift();
317 return result;
318 }
319
320 void Lexer::shiftLineTerminator()
321 {
322 ASSERT(isLineTerminator(m_current));
323
324 int m_prev = m_current;
325 shift();
326
327 // Allow both CRLF and LFCR.
328 if (m_prev + m_current == '\n' + '\r')
329 shift();
330
331 ++m_lineNumber;
332 }
333
334 ALWAYS_INLINE bool Lexer::lastTokenWasRestrKeyword() const
335 {
336 return m_lastToken == CONTINUE || m_lastToken == BREAK || m_lastToken == RETURN || m_lastToken == THROW;
337 }
338
339 static NEVER_INLINE bool isNonASCIIIdentStart(int c)
340 {
341 return category(c) & (Letter_Uppercase | Letter_Lowercase | Letter_Titlecase | Letter_Modifier | Letter_Other);
342 }
343
344 static inline bool isIdentStart(int c)
345 {
346 return isASCII(c) ? typesOfASCIICharacters[c] == CharacterIdentifierStart : isNonASCIIIdentStart(c);
347 }
348
349 static NEVER_INLINE bool isNonASCIIIdentPart(int c)
350 {
351 return category(c) & (Letter_Uppercase | Letter_Lowercase | Letter_Titlecase | Letter_Modifier | Letter_Other
352 | Mark_NonSpacing | Mark_SpacingCombining | Number_DecimalDigit | Punctuation_Connector);
353 }
354
355 static ALWAYS_INLINE bool isIdentPart(int c)
356 {
357 // Character types are divided into two groups depending on whether they can be part of an
358 // identifier or not. Those whose type value is less or equal than CharacterNumber can be
359 // part of an identifier. (See the CharacterType definition for more details.)
360 return isASCII(c) ? typesOfASCIICharacters[c] <= CharacterNumber : isNonASCIIIdentPart(c);
361 }
362
// Maps the character following a backslash to the character it denotes for the
// single-character escapes (\b \t \n \v \f \r \\ \' \"). Returns 0 for any other
// character, meaning "not a single-character escape".
static inline int singleEscape(int c)
{
    // Escapes that stand for themselves.
    if (c == '\\' || c == '\'' || c == '"')
        return c;

    // Escapes that stand for a control character.
    switch (c) {
    case 'b':
        return 0x08;
    case 't':
        return 0x09;
    case 'n':
        return 0x0A;
    case 'v':
        return 0x0B;
    case 'f':
        return 0x0C;
    case 'r':
        return 0x0D;
    }
    return 0;
}
388
389 inline void Lexer::record8(int c)
390 {
391 ASSERT(c >= 0);
392 ASSERT(c <= 0xFF);
393 m_buffer8.append(static_cast<char>(c));
394 }
395
396 inline void Lexer::record16(UChar c)
397 {
398 m_buffer16.append(c);
399 }
400
401 inline void Lexer::record16(int c)
402 {
403 ASSERT(c >= 0);
404 ASSERT(c <= USHRT_MAX);
405 record16(UChar(static_cast<unsigned short>(c)));
406 }
407
408 template <bool shouldCreateIdentifier> ALWAYS_INLINE JSTokenType Lexer::parseIdentifier(JSTokenData* tokenData, unsigned lexType)
409 {
410 const ptrdiff_t remaining = m_codeEnd - m_code;
411 if ((remaining >= maxTokenLength) && !(lexType & IgnoreReservedWords)) {
412 JSTokenType keyword = parseKeyword<shouldCreateIdentifier>(tokenData);
413 if (keyword != IDENT) {
414 ASSERT((!shouldCreateIdentifier) || tokenData->ident);
415 return keyword;
416 }
417 }
418 const UChar* identifierStart = currentCharacter();
419 bool bufferRequired = false;
420
421 while (true) {
422 if (LIKELY(isIdentPart(m_current))) {
423 shift();
424 continue;
425 }
426 if (LIKELY(m_current != '\\'))
427 break;
428
429 // \uXXXX unicode characters.
430 bufferRequired = true;
431 if (identifierStart != currentCharacter())
432 m_buffer16.append(identifierStart, currentCharacter() - identifierStart);
433 shift();
434 if (UNLIKELY(m_current != 'u'))
435 return ERRORTOK;
436 shift();
437 int character = getUnicodeCharacter();
438 if (UNLIKELY(character == -1))
439 return ERRORTOK;
440 if (UNLIKELY(m_buffer16.size() ? !isIdentPart(character) : !isIdentStart(character)))
441 return ERRORTOK;
442 if (shouldCreateIdentifier)
443 record16(character);
444 identifierStart = currentCharacter();
445 }
446
447 int identifierLength;
448 const Identifier* ident = 0;
449 if (shouldCreateIdentifier) {
450 if (!bufferRequired)
451 identifierLength = currentCharacter() - identifierStart;
452 else {
453 if (identifierStart != currentCharacter())
454 m_buffer16.append(identifierStart, currentCharacter() - identifierStart);
455 identifierStart = m_buffer16.data();
456 identifierLength = m_buffer16.size();
457 }
458
459 ident = makeIdentifier(identifierStart, identifierLength);
460 tokenData->ident = ident;
461 } else
462 tokenData->ident = 0;
463
464 m_delimited = false;
465
466 if (LIKELY(!bufferRequired && !(lexType & IgnoreReservedWords))) {
467 ASSERT(shouldCreateIdentifier);
468 // Keywords must not be recognized if there was an \uXXXX in the identifier.
469 if (remaining < maxTokenLength) {
470 const HashEntry* entry = m_keywordTable.entry(m_globalData, *ident);
471 ASSERT((remaining < maxTokenLength) || !entry);
472 return entry ? static_cast<JSTokenType>(entry->lexerValue()) : IDENT;
473 }
474 return IDENT;
475 }
476
477 m_buffer16.resize(0);
478 return IDENT;
479 }
480
481 bool Lexer::isKeyword(const Identifier& ident)
482 {
483 return m_keywordTable.entry(m_globalData, ident);
484 }
485
486 template <bool shouldBuildStrings> ALWAYS_INLINE bool Lexer::parseString(JSTokenData* tokenData, bool strictMode)
487 {
488 int stringQuoteCharacter = m_current;
489 shift();
490
491 const UChar* stringStart = currentCharacter();
492
493 while (m_current != stringQuoteCharacter) {
494 if (UNLIKELY(m_current == '\\')) {
495 if (stringStart != currentCharacter() && shouldBuildStrings)
496 m_buffer16.append(stringStart, currentCharacter() - stringStart);
497 shift();
498
499 int escape = singleEscape(m_current);
500
501 // Most common escape sequences first
502 if (escape) {
503 if (shouldBuildStrings)
504 record16(escape);
505 shift();
506 } else if (UNLIKELY(isLineTerminator(m_current)))
507 shiftLineTerminator();
508 else if (m_current == 'x') {
509 shift();
510 if (isASCIIHexDigit(m_current) && isASCIIHexDigit(peek(1))) {
511 int prev = m_current;
512 shift();
513 if (shouldBuildStrings)
514 record16(convertHex(prev, m_current));
515 shift();
516 } else if (shouldBuildStrings)
517 record16('x');
518 } else if (m_current == 'u') {
519 shift();
520 int character = getUnicodeCharacter();
521 if (character != -1) {
522 if (shouldBuildStrings)
523 record16(character);
524 } else if (m_current == stringQuoteCharacter) {
525 if (shouldBuildStrings)
526 record16('u');
527 } else // Only stringQuoteCharacter allowed after \u
528 return false;
529 } else if (strictMode && isASCIIDigit(m_current)) {
530 // The only valid numeric escape in strict mode is '\0', and this must not be followed by a decimal digit.
531 int character1 = m_current;
532 shift();
533 if (character1 != '0' || isASCIIDigit(m_current))
534 return false;
535 if (shouldBuildStrings)
536 record16(0);
537 } else if (!strictMode && isASCIIOctalDigit(m_current)) {
538 // Octal character sequences
539 int character1 = m_current;
540 shift();
541 if (isASCIIOctalDigit(m_current)) {
542 // Two octal characters
543 int character2 = m_current;
544 shift();
545 if (character1 >= '0' && character1 <= '3' && isASCIIOctalDigit(m_current)) {
546 if (shouldBuildStrings)
547 record16((character1 - '0') * 64 + (character2 - '0') * 8 + m_current - '0');
548 shift();
549 } else {
550 if (shouldBuildStrings)
551 record16((character1 - '0') * 8 + character2 - '0');
552 }
553 } else {
554 if (shouldBuildStrings)
555 record16(character1 - '0');
556 }
557 } else if (m_current != -1) {
558 if (shouldBuildStrings)
559 record16(m_current);
560 shift();
561 } else
562 return false;
563
564 stringStart = currentCharacter();
565 continue;
566 }
567 // Fast check for characters that require special handling.
568 // Catches -1, \n, \r, 0x2028, and 0x2029 as efficiently
569 // as possible, and lets through all common ASCII characters.
570 if (UNLIKELY(((static_cast<unsigned>(m_current) - 0xE) & 0x2000))) {
571 // New-line or end of input is not allowed
572 if (UNLIKELY(isLineTerminator(m_current)) || UNLIKELY(m_current == -1))
573 return false;
574 // Anything else is just a normal character
575 }
576 shift();
577 }
578
579 if (currentCharacter() != stringStart && shouldBuildStrings)
580 m_buffer16.append(stringStart, currentCharacter() - stringStart);
581 if (shouldBuildStrings)
582 tokenData->ident = makeIdentifier(m_buffer16.data(), m_buffer16.size());
583 else
584 tokenData->ident = 0;
585
586 m_buffer16.resize(0);
587 return true;
588 }
589
590 ALWAYS_INLINE void Lexer::parseHex(double& returnValue)
591 {
592 // Optimization: most hexadecimal values fit into 4 bytes.
593 uint32_t hexValue = 0;
594 int maximumDigits = 7;
595
596 // Shift out the 'x' prefix.
597 shift();
598
599 do {
600 hexValue = (hexValue << 4) + toASCIIHexValue(m_current);
601 shift();
602 --maximumDigits;
603 } while (isASCIIHexDigit(m_current) && maximumDigits >= 0);
604
605 if (maximumDigits >= 0) {
606 returnValue = hexValue;
607 return;
608 }
609
610 // No more place in the hexValue buffer.
611 // The values are shifted out and placed into the m_buffer8 vector.
612 for (int i = 0; i < 8; ++i) {
613 int digit = hexValue >> 28;
614 if (digit < 10)
615 record8(digit + '0');
616 else
617 record8(digit - 10 + 'a');
618 hexValue <<= 4;
619 }
620
621 while (isASCIIHexDigit(m_current)) {
622 record8(m_current);
623 shift();
624 }
625
626 returnValue = parseIntOverflow(m_buffer8.data(), m_buffer8.size(), 16);
627 }
628
629 ALWAYS_INLINE bool Lexer::parseOctal(double& returnValue)
630 {
631 // Optimization: most octal values fit into 4 bytes.
632 uint32_t octalValue = 0;
633 int maximumDigits = 9;
634 // Temporary buffer for the digits. Makes easier
635 // to reconstruct the input characters when needed.
636 char digits[10];
637
638 do {
639 octalValue = octalValue * 8 + (m_current - '0');
640 digits[maximumDigits] = m_current;
641 shift();
642 --maximumDigits;
643 } while (isASCIIOctalDigit(m_current) && maximumDigits >= 0);
644
645 if (!isASCIIDigit(m_current) && maximumDigits >= 0) {
646 returnValue = octalValue;
647 return true;
648 }
649
650 for (int i = 9; i > maximumDigits; --i)
651 record8(digits[i]);
652
653 while (isASCIIOctalDigit(m_current)) {
654 record8(m_current);
655 shift();
656 }
657
658 if (isASCIIDigit(m_current))
659 return false;
660
661 returnValue = parseIntOverflow(m_buffer8.data(), m_buffer8.size(), 8);
662 return true;
663 }
664
665 ALWAYS_INLINE bool Lexer::parseDecimal(double& returnValue)
666 {
667 // Optimization: most decimal values fit into 4 bytes.
668 uint32_t decimalValue = 0;
669
670 // Since parseOctal may be executed before parseDecimal,
671 // the m_buffer8 may hold ascii digits.
672 if (!m_buffer8.size()) {
673 int maximumDigits = 9;
674 // Temporary buffer for the digits. Makes easier
675 // to reconstruct the input characters when needed.
676 char digits[10];
677
678 do {
679 decimalValue = decimalValue * 10 + (m_current - '0');
680 digits[maximumDigits] = m_current;
681 shift();
682 --maximumDigits;
683 } while (isASCIIDigit(m_current) && maximumDigits >= 0);
684
685 if (maximumDigits >= 0 && m_current != '.' && (m_current | 0x20) != 'e') {
686 returnValue = decimalValue;
687 return true;
688 }
689
690 for (int i = 9; i > maximumDigits; --i)
691 record8(digits[i]);
692 }
693
694 while (isASCIIDigit(m_current)) {
695 record8(m_current);
696 shift();
697 }
698
699 return false;
700 }
701
702 ALWAYS_INLINE void Lexer::parseNumberAfterDecimalPoint()
703 {
704 record8('.');
705 while (isASCIIDigit(m_current)) {
706 record8(m_current);
707 shift();
708 }
709 }
710
711 ALWAYS_INLINE bool Lexer::parseNumberAfterExponentIndicator()
712 {
713 record8('e');
714 shift();
715 if (m_current == '+' || m_current == '-') {
716 record8(m_current);
717 shift();
718 }
719
720 if (!isASCIIDigit(m_current))
721 return false;
722
723 do {
724 record8(m_current);
725 shift();
726 } while (isASCIIDigit(m_current));
727 return true;
728 }
729
730 ALWAYS_INLINE bool Lexer::parseMultilineComment()
731 {
732 while (true) {
733 while (UNLIKELY(m_current == '*')) {
734 shift();
735 if (m_current == '/') {
736 shift();
737 return true;
738 }
739 }
740
741 if (UNLIKELY(m_current == -1))
742 return false;
743
744 if (isLineTerminator(m_current))
745 shiftLineTerminator();
746 else
747 shift();
748 }
749 }
750
751 bool Lexer::nextTokenIsColon()
752 {
753 const UChar* code = m_code;
754 while (code < m_codeEnd && (isWhiteSpace(*code) || isLineTerminator(*code)))
755 code++;
756
757 return code < m_codeEnd && *code == ':';
758 }
759
760 JSTokenType Lexer::lex(JSTokenData* tokenData, JSTokenInfo* tokenInfo, unsigned lexType, bool strictMode)
761 {
762 ASSERT(!m_error);
763 ASSERT(m_buffer8.isEmpty());
764 ASSERT(m_buffer16.isEmpty());
765
766 JSTokenType token = ERRORTOK;
767 m_terminator = false;
768
769 start:
770 while (isWhiteSpace(m_current))
771 shift();
772
773 int startOffset = currentOffset();
774
775 if (UNLIKELY(m_current == -1))
776 return EOFTOK;
777
778 m_delimited = false;
779
780 CharacterType type;
781 if (LIKELY(isASCII(m_current)))
782 type = static_cast<CharacterType>(typesOfASCIICharacters[m_current]);
783 else if (isNonASCIIIdentStart(m_current))
784 type = CharacterIdentifierStart;
785 else if (isLineTerminator(m_current))
786 type = CharacterLineTerminator;
787 else
788 type = CharacterInvalid;
789
790 switch (type) {
791 case CharacterGreater:
792 shift();
793 if (m_current == '>') {
794 shift();
795 if (m_current == '>') {
796 shift();
797 if (m_current == '=') {
798 shift();
799 token = URSHIFTEQUAL;
800 break;
801 }
802 token = URSHIFT;
803 break;
804 }
805 if (m_current == '=') {
806 shift();
807 token = RSHIFTEQUAL;
808 break;
809 }
810 token = RSHIFT;
811 break;
812 }
813 if (m_current == '=') {
814 shift();
815 token = GE;
816 break;
817 }
818 token = GT;
819 break;
820 case CharacterEqual:
821 shift();
822 if (m_current == '=') {
823 shift();
824 if (m_current == '=') {
825 shift();
826 token = STREQ;
827 break;
828 }
829 token = EQEQ;
830 break;
831 }
832 token = EQUAL;
833 break;
834 case CharacterLess:
835 shift();
836 if (m_current == '!' && peek(1) == '-' && peek(2) == '-') {
837 // <!-- marks the beginning of a line comment (for www usage)
838 goto inSingleLineComment;
839 }
840 if (m_current == '<') {
841 shift();
842 if (m_current == '=') {
843 shift();
844 token = LSHIFTEQUAL;
845 break;
846 }
847 token = LSHIFT;
848 break;
849 }
850 if (m_current == '=') {
851 shift();
852 token = LE;
853 break;
854 }
855 token = LT;
856 break;
857 case CharacterExclamationMark:
858 shift();
859 if (m_current == '=') {
860 shift();
861 if (m_current == '=') {
862 shift();
863 token = STRNEQ;
864 break;
865 }
866 token = NE;
867 break;
868 }
869 token = EXCLAMATION;
870 break;
871 case CharacterAdd:
872 shift();
873 if (m_current == '+') {
874 shift();
875 token = (!m_terminator) ? PLUSPLUS : AUTOPLUSPLUS;
876 break;
877 }
878 if (m_current == '=') {
879 shift();
880 token = PLUSEQUAL;
881 break;
882 }
883 token = PLUS;
884 break;
885 case CharacterSub:
886 shift();
887 if (m_current == '-') {
888 shift();
889 if (m_atLineStart && m_current == '>') {
890 shift();
891 goto inSingleLineComment;
892 }
893 token = (!m_terminator) ? MINUSMINUS : AUTOMINUSMINUS;
894 break;
895 }
896 if (m_current == '=') {
897 shift();
898 token = MINUSEQUAL;
899 break;
900 }
901 token = MINUS;
902 break;
903 case CharacterMultiply:
904 shift();
905 if (m_current == '=') {
906 shift();
907 token = MULTEQUAL;
908 break;
909 }
910 token = TIMES;
911 break;
912 case CharacterSlash:
913 shift();
914 if (m_current == '/') {
915 shift();
916 goto inSingleLineComment;
917 }
918 if (m_current == '*') {
919 shift();
920 if (parseMultilineComment())
921 goto start;
922 goto returnError;
923 }
924 if (m_current == '=') {
925 shift();
926 token = DIVEQUAL;
927 break;
928 }
929 token = DIVIDE;
930 break;
931 case CharacterAnd:
932 shift();
933 if (m_current == '&') {
934 shift();
935 token = AND;
936 break;
937 }
938 if (m_current == '=') {
939 shift();
940 token = ANDEQUAL;
941 break;
942 }
943 token = BITAND;
944 break;
945 case CharacterXor:
946 shift();
947 if (m_current == '=') {
948 shift();
949 token = XOREQUAL;
950 break;
951 }
952 token = BITXOR;
953 break;
954 case CharacterModulo:
955 shift();
956 if (m_current == '=') {
957 shift();
958 token = MODEQUAL;
959 break;
960 }
961 token = MOD;
962 break;
963 case CharacterOr:
964 shift();
965 if (m_current == '=') {
966 shift();
967 token = OREQUAL;
968 break;
969 }
970 if (m_current == '|') {
971 shift();
972 token = OR;
973 break;
974 }
975 token = BITOR;
976 break;
977 case CharacterOpenParen:
978 token = OPENPAREN;
979 shift();
980 break;
981 case CharacterCloseParen:
982 token = CLOSEPAREN;
983 shift();
984 break;
985 case CharacterOpenBracket:
986 token = OPENBRACKET;
987 shift();
988 break;
989 case CharacterCloseBracket:
990 token = CLOSEBRACKET;
991 shift();
992 break;
993 case CharacterComma:
994 token = COMMA;
995 shift();
996 break;
997 case CharacterColon:
998 token = COLON;
999 shift();
1000 break;
1001 case CharacterQuestion:
1002 token = QUESTION;
1003 shift();
1004 break;
1005 case CharacterTilde:
1006 token = TILDE;
1007 shift();
1008 break;
1009 case CharacterSemicolon:
1010 m_delimited = true;
1011 shift();
1012 token = SEMICOLON;
1013 break;
1014 case CharacterOpenBrace:
1015 tokenData->intValue = currentOffset();
1016 shift();
1017 token = OPENBRACE;
1018 break;
1019 case CharacterCloseBrace:
1020 tokenData->intValue = currentOffset();
1021 m_delimited = true;
1022 shift();
1023 token = CLOSEBRACE;
1024 break;
1025 case CharacterDot:
1026 shift();
1027 if (!isASCIIDigit(m_current)) {
1028 token = DOT;
1029 break;
1030 }
1031 goto inNumberAfterDecimalPoint;
1032 case CharacterZero:
1033 shift();
1034 if ((m_current | 0x20) == 'x' && isASCIIHexDigit(peek(1))) {
1035 parseHex(tokenData->doubleValue);
1036 token = NUMBER;
1037 } else {
1038 record8('0');
1039 if (isASCIIOctalDigit(m_current)) {
1040 if (parseOctal(tokenData->doubleValue)) {
1041 if (strictMode)
1042 goto returnError;
1043 token = NUMBER;
1044 }
1045 }
1046 }
1047 // Fall through into CharacterNumber
1048 case CharacterNumber:
1049 if (LIKELY(token != NUMBER)) {
1050 if (!parseDecimal(tokenData->doubleValue)) {
1051 if (m_current == '.') {
1052 shift();
1053 inNumberAfterDecimalPoint:
1054 parseNumberAfterDecimalPoint();
1055 }
1056 if ((m_current | 0x20) == 'e')
1057 if (!parseNumberAfterExponentIndicator())
1058 goto returnError;
1059 // Null-terminate string for strtod.
1060 m_buffer8.append('\0');
1061 tokenData->doubleValue = WTF::strtod(m_buffer8.data(), 0);
1062 }
1063 token = NUMBER;
1064 }
1065
1066 // No identifiers allowed directly after numeric literal, e.g. "3in" is bad.
1067 if (UNLIKELY(isIdentStart(m_current)))
1068 goto returnError;
1069 m_buffer8.resize(0);
1070 m_delimited = false;
1071 break;
1072 case CharacterQuote:
1073 if (lexType & DontBuildStrings) {
1074 if (UNLIKELY(!parseString<false>(tokenData, strictMode)))
1075 goto returnError;
1076 } else {
1077 if (UNLIKELY(!parseString<true>(tokenData, strictMode)))
1078 goto returnError;
1079 }
1080 shift();
1081 m_delimited = false;
1082 token = STRING;
1083 break;
1084 case CharacterIdentifierStart:
1085 ASSERT(isIdentStart(m_current));
1086 // Fall through into CharacterBackSlash.
1087 case CharacterBackSlash:
1088 if (lexType & DontBuildKeywords)
1089 token = parseIdentifier<false>(tokenData, lexType);
1090 else
1091 token = parseIdentifier<true>(tokenData, lexType);
1092 break;
1093 case CharacterLineTerminator:
1094 ASSERT(isLineTerminator(m_current));
1095 shiftLineTerminator();
1096 m_atLineStart = true;
1097 m_terminator = true;
1098 goto start;
1099 case CharacterInvalid:
1100 goto returnError;
1101 default:
1102 ASSERT_NOT_REACHED();
1103 goto returnError;
1104 }
1105
1106 m_atLineStart = false;
1107 goto returnToken;
1108
1109 inSingleLineComment:
1110 while (!isLineTerminator(m_current)) {
1111 if (UNLIKELY(m_current == -1))
1112 return EOFTOK;
1113 shift();
1114 }
1115 shiftLineTerminator();
1116 m_atLineStart = true;
1117 m_terminator = true;
1118 if (!lastTokenWasRestrKeyword())
1119 goto start;
1120
1121 token = SEMICOLON;
1122 m_delimited = true;
1123 // Fall through into returnToken.
1124
1125 returnToken:
1126 tokenInfo->line = m_lineNumber;
1127 tokenInfo->startOffset = startOffset;
1128 tokenInfo->endOffset = currentOffset();
1129 m_lastToken = token;
1130 return token;
1131
1132 returnError:
1133 m_error = true;
1134 return ERRORTOK;
1135 }
1136
1137 bool Lexer::scanRegExp(const Identifier*& pattern, const Identifier*& flags, UChar patternPrefix)
1138 {
1139 ASSERT(m_buffer16.isEmpty());
1140
1141 bool lastWasEscape = false;
1142 bool inBrackets = false;
1143
1144 if (patternPrefix) {
1145 ASSERT(!isLineTerminator(patternPrefix));
1146 ASSERT(patternPrefix != '/');
1147 ASSERT(patternPrefix != '[');
1148 record16(patternPrefix);
1149 }
1150
1151 while (true) {
1152 int current = m_current;
1153
1154 if (isLineTerminator(current) || current == -1) {
1155 m_buffer16.resize(0);
1156 return false;
1157 }
1158
1159 shift();
1160
1161 if (current == '/' && !lastWasEscape && !inBrackets)
1162 break;
1163
1164 record16(current);
1165
1166 if (lastWasEscape) {
1167 lastWasEscape = false;
1168 continue;
1169 }
1170
1171 switch (current) {
1172 case '[':
1173 inBrackets = true;
1174 break;
1175 case ']':
1176 inBrackets = false;
1177 break;
1178 case '\\':
1179 lastWasEscape = true;
1180 break;
1181 }
1182 }
1183
1184 pattern = makeIdentifier(m_buffer16.data(), m_buffer16.size());
1185 m_buffer16.resize(0);
1186
1187 while (isIdentPart(m_current)) {
1188 record16(m_current);
1189 shift();
1190 }
1191
1192 flags = makeIdentifier(m_buffer16.data(), m_buffer16.size());
1193 m_buffer16.resize(0);
1194
1195 return true;
1196 }
1197
1198 bool Lexer::skipRegExp()
1199 {
1200 bool lastWasEscape = false;
1201 bool inBrackets = false;
1202
1203 while (true) {
1204 int current = m_current;
1205
1206 if (isLineTerminator(current) || current == -1)
1207 return false;
1208
1209 shift();
1210
1211 if (current == '/' && !lastWasEscape && !inBrackets)
1212 break;
1213
1214 if (lastWasEscape) {
1215 lastWasEscape = false;
1216 continue;
1217 }
1218
1219 switch (current) {
1220 case '[':
1221 inBrackets = true;
1222 break;
1223 case ']':
1224 inBrackets = false;
1225 break;
1226 case '\\':
1227 lastWasEscape = true;
1228 break;
1229 }
1230 }
1231
1232 while (isIdentPart(m_current))
1233 shift();
1234
1235 return true;
1236 }
1237
1238 void Lexer::clear()
1239 {
1240 m_arena = 0;
1241
1242 Vector<char> newBuffer8;
1243 m_buffer8.swap(newBuffer8);
1244
1245 Vector<UChar> newBuffer16;
1246 m_buffer16.swap(newBuffer16);
1247
1248 m_isReparsing = false;
1249 }
1250
1251 SourceCode Lexer::sourceCode(int openBrace, int closeBrace, int firstLine)
1252 {
1253 ASSERT(m_source->provider()->data()[openBrace] == '{');
1254 ASSERT(m_source->provider()->data()[closeBrace] == '}');
1255 return SourceCode(m_source->provider(), openBrace, closeBrace + 1, firstLine);
1256 }
1257
1258 } // namespace JSC