]> git.saurik.com Git - apple/javascriptcore.git/blob - parser/Lexer.cpp
JavaScriptCore-1218.33.tar.gz
[apple/javascriptcore.git] / parser / Lexer.cpp
1 /*
2 * Copyright (C) 1999-2000 Harri Porten (porten@kde.org)
3 * Copyright (C) 2006, 2007, 2008, 2009, 2011, 2012, 2013 Apple Inc. All Rights Reserved.
4 * Copyright (C) 2007 Cameron Zwarich (cwzwarich@uwaterloo.ca)
5 * Copyright (C) 2010 Zoltan Herczeg (zherczeg@inf.u-szeged.hu)
6 * Copyright (C) 2012 Mathias Bynens (mathias@qiwi.be)
7 *
8 * This library is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Library General Public
10 * License as published by the Free Software Foundation; either
11 * version 2 of the License, or (at your option) any later version.
12 *
13 * This library is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 * Library General Public License for more details.
17 *
18 * You should have received a copy of the GNU Library General Public License
19 * along with this library; see the file COPYING.LIB. If not, write to
20 * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
21 * Boston, MA 02110-1301, USA.
22 *
23 */
24
25 #include "config.h"
26 #include "Lexer.h"
27
28 #include "JSFunction.h"
29
30 #include "JSGlobalObjectFunctions.h"
31 #include "Identifier.h"
32 #include "NodeInfo.h"
33 #include "Nodes.h"
34 #include <wtf/dtoa.h>
35 #include <ctype.h>
36 #include <limits.h>
37 #include <string.h>
38 #include <wtf/Assertions.h>
39
40 using namespace WTF;
41 using namespace Unicode;
42
43 #include "KeywordLookup.h"
44 #include "Lexer.lut.h"
45 #include "Parser.h"
46
47 namespace JSC {
48
49 Keywords::Keywords(VM* vm)
50 : m_vm(vm)
51 , m_keywordTable(JSC::mainTable)
52 {
53 }
54
// Classification stored for every Latin-1 code in typesOfLatin1Characters.
enum CharacterType {
    // Types for the main switch.

    // These first three enumerators must keep their positions: isIdentStart
    // tests for equality with CharacterIdentifierStart and isIdentPart tests
    // for "<= CharacterNumber" to recognize identifier characters.
    CharacterIdentifierStart,
    CharacterZero,
    CharacterNumber,

    // Punctuators and other single-character classes.
    CharacterInvalid,
    CharacterLineTerminator,
    CharacterExclamationMark,
    CharacterOpenParen,
    CharacterCloseParen,
    CharacterOpenBracket,
    CharacterCloseBracket,
    CharacterComma,
    CharacterColon,
    CharacterQuestion,
    CharacterTilde,
    CharacterQuote,
    CharacterDot,
    CharacterSlash,
    CharacterBackSlash,
    CharacterSemicolon,
    CharacterOpenBrace,
    CharacterCloseBrace,

    // Characters that begin operators.
    CharacterAdd,
    CharacterSub,
    CharacterMultiply,
    CharacterModulo,
    CharacterAnd,
    CharacterXor,
    CharacterOr,
    CharacterLess,
    CharacterGreater,
    CharacterEqual,

    // Other types (only one so far).
    CharacterWhiteSpace,
};
97
98 // 256 Latin-1 codes
99 static const unsigned short typesOfLatin1Characters[256] = {
100 /* 0 - Null */ CharacterInvalid,
101 /* 1 - Start of Heading */ CharacterInvalid,
102 /* 2 - Start of Text */ CharacterInvalid,
103 /* 3 - End of Text */ CharacterInvalid,
104 /* 4 - End of Transm. */ CharacterInvalid,
105 /* 5 - Enquiry */ CharacterInvalid,
106 /* 6 - Acknowledgment */ CharacterInvalid,
107 /* 7 - Bell */ CharacterInvalid,
108 /* 8 - Back Space */ CharacterInvalid,
109 /* 9 - Horizontal Tab */ CharacterWhiteSpace,
110 /* 10 - Line Feed */ CharacterLineTerminator,
111 /* 11 - Vertical Tab */ CharacterWhiteSpace,
112 /* 12 - Form Feed */ CharacterWhiteSpace,
113 /* 13 - Carriage Return */ CharacterLineTerminator,
114 /* 14 - Shift Out */ CharacterInvalid,
115 /* 15 - Shift In */ CharacterInvalid,
116 /* 16 - Data Line Escape */ CharacterInvalid,
117 /* 17 - Device Control 1 */ CharacterInvalid,
118 /* 18 - Device Control 2 */ CharacterInvalid,
119 /* 19 - Device Control 3 */ CharacterInvalid,
120 /* 20 - Device Control 4 */ CharacterInvalid,
121 /* 21 - Negative Ack. */ CharacterInvalid,
122 /* 22 - Synchronous Idle */ CharacterInvalid,
123 /* 23 - End of Transmit */ CharacterInvalid,
124 /* 24 - Cancel */ CharacterInvalid,
125 /* 25 - End of Medium */ CharacterInvalid,
126 /* 26 - Substitute */ CharacterInvalid,
127 /* 27 - Escape */ CharacterInvalid,
128 /* 28 - File Separator */ CharacterInvalid,
129 /* 29 - Group Separator */ CharacterInvalid,
130 /* 30 - Record Separator */ CharacterInvalid,
131 /* 31 - Unit Separator */ CharacterInvalid,
132 /* 32 - Space */ CharacterWhiteSpace,
133 /* 33 - ! */ CharacterExclamationMark,
134 /* 34 - " */ CharacterQuote,
135 /* 35 - # */ CharacterInvalid,
136 /* 36 - $ */ CharacterIdentifierStart,
137 /* 37 - % */ CharacterModulo,
138 /* 38 - & */ CharacterAnd,
139 /* 39 - ' */ CharacterQuote,
140 /* 40 - ( */ CharacterOpenParen,
141 /* 41 - ) */ CharacterCloseParen,
142 /* 42 - * */ CharacterMultiply,
143 /* 43 - + */ CharacterAdd,
144 /* 44 - , */ CharacterComma,
145 /* 45 - - */ CharacterSub,
146 /* 46 - . */ CharacterDot,
147 /* 47 - / */ CharacterSlash,
148 /* 48 - 0 */ CharacterZero,
149 /* 49 - 1 */ CharacterNumber,
150 /* 50 - 2 */ CharacterNumber,
151 /* 51 - 3 */ CharacterNumber,
152 /* 52 - 4 */ CharacterNumber,
153 /* 53 - 5 */ CharacterNumber,
154 /* 54 - 6 */ CharacterNumber,
155 /* 55 - 7 */ CharacterNumber,
156 /* 56 - 8 */ CharacterNumber,
157 /* 57 - 9 */ CharacterNumber,
158 /* 58 - : */ CharacterColon,
159 /* 59 - ; */ CharacterSemicolon,
160 /* 60 - < */ CharacterLess,
161 /* 61 - = */ CharacterEqual,
162 /* 62 - > */ CharacterGreater,
163 /* 63 - ? */ CharacterQuestion,
164 /* 64 - @ */ CharacterInvalid,
165 /* 65 - A */ CharacterIdentifierStart,
166 /* 66 - B */ CharacterIdentifierStart,
167 /* 67 - C */ CharacterIdentifierStart,
168 /* 68 - D */ CharacterIdentifierStart,
169 /* 69 - E */ CharacterIdentifierStart,
170 /* 70 - F */ CharacterIdentifierStart,
171 /* 71 - G */ CharacterIdentifierStart,
172 /* 72 - H */ CharacterIdentifierStart,
173 /* 73 - I */ CharacterIdentifierStart,
174 /* 74 - J */ CharacterIdentifierStart,
175 /* 75 - K */ CharacterIdentifierStart,
176 /* 76 - L */ CharacterIdentifierStart,
177 /* 77 - M */ CharacterIdentifierStart,
178 /* 78 - N */ CharacterIdentifierStart,
179 /* 79 - O */ CharacterIdentifierStart,
180 /* 80 - P */ CharacterIdentifierStart,
181 /* 81 - Q */ CharacterIdentifierStart,
182 /* 82 - R */ CharacterIdentifierStart,
183 /* 83 - S */ CharacterIdentifierStart,
184 /* 84 - T */ CharacterIdentifierStart,
185 /* 85 - U */ CharacterIdentifierStart,
186 /* 86 - V */ CharacterIdentifierStart,
187 /* 87 - W */ CharacterIdentifierStart,
188 /* 88 - X */ CharacterIdentifierStart,
189 /* 89 - Y */ CharacterIdentifierStart,
190 /* 90 - Z */ CharacterIdentifierStart,
191 /* 91 - [ */ CharacterOpenBracket,
192 /* 92 - \ */ CharacterBackSlash,
193 /* 93 - ] */ CharacterCloseBracket,
194 /* 94 - ^ */ CharacterXor,
195 /* 95 - _ */ CharacterIdentifierStart,
196 /* 96 - ` */ CharacterInvalid,
197 /* 97 - a */ CharacterIdentifierStart,
198 /* 98 - b */ CharacterIdentifierStart,
199 /* 99 - c */ CharacterIdentifierStart,
200 /* 100 - d */ CharacterIdentifierStart,
201 /* 101 - e */ CharacterIdentifierStart,
202 /* 102 - f */ CharacterIdentifierStart,
203 /* 103 - g */ CharacterIdentifierStart,
204 /* 104 - h */ CharacterIdentifierStart,
205 /* 105 - i */ CharacterIdentifierStart,
206 /* 106 - j */ CharacterIdentifierStart,
207 /* 107 - k */ CharacterIdentifierStart,
208 /* 108 - l */ CharacterIdentifierStart,
209 /* 109 - m */ CharacterIdentifierStart,
210 /* 110 - n */ CharacterIdentifierStart,
211 /* 111 - o */ CharacterIdentifierStart,
212 /* 112 - p */ CharacterIdentifierStart,
213 /* 113 - q */ CharacterIdentifierStart,
214 /* 114 - r */ CharacterIdentifierStart,
215 /* 115 - s */ CharacterIdentifierStart,
216 /* 116 - t */ CharacterIdentifierStart,
217 /* 117 - u */ CharacterIdentifierStart,
218 /* 118 - v */ CharacterIdentifierStart,
219 /* 119 - w */ CharacterIdentifierStart,
220 /* 120 - x */ CharacterIdentifierStart,
221 /* 121 - y */ CharacterIdentifierStart,
222 /* 122 - z */ CharacterIdentifierStart,
223 /* 123 - { */ CharacterOpenBrace,
224 /* 124 - | */ CharacterOr,
225 /* 125 - } */ CharacterCloseBrace,
226 /* 126 - ~ */ CharacterTilde,
227 /* 127 - Delete */ CharacterInvalid,
228 /* 128 - Cc category */ CharacterInvalid,
229 /* 129 - Cc category */ CharacterInvalid,
230 /* 130 - Cc category */ CharacterInvalid,
231 /* 131 - Cc category */ CharacterInvalid,
232 /* 132 - Cc category */ CharacterInvalid,
233 /* 133 - Cc category */ CharacterInvalid,
234 /* 134 - Cc category */ CharacterInvalid,
235 /* 135 - Cc category */ CharacterInvalid,
236 /* 136 - Cc category */ CharacterInvalid,
237 /* 137 - Cc category */ CharacterInvalid,
238 /* 138 - Cc category */ CharacterInvalid,
239 /* 139 - Cc category */ CharacterInvalid,
240 /* 140 - Cc category */ CharacterInvalid,
241 /* 141 - Cc category */ CharacterInvalid,
242 /* 142 - Cc category */ CharacterInvalid,
243 /* 143 - Cc category */ CharacterInvalid,
244 /* 144 - Cc category */ CharacterInvalid,
245 /* 145 - Cc category */ CharacterInvalid,
246 /* 146 - Cc category */ CharacterInvalid,
247 /* 147 - Cc category */ CharacterInvalid,
248 /* 148 - Cc category */ CharacterInvalid,
249 /* 149 - Cc category */ CharacterInvalid,
250 /* 150 - Cc category */ CharacterInvalid,
251 /* 151 - Cc category */ CharacterInvalid,
252 /* 152 - Cc category */ CharacterInvalid,
253 /* 153 - Cc category */ CharacterInvalid,
254 /* 154 - Cc category */ CharacterInvalid,
255 /* 155 - Cc category */ CharacterInvalid,
256 /* 156 - Cc category */ CharacterInvalid,
257 /* 157 - Cc category */ CharacterInvalid,
258 /* 158 - Cc category */ CharacterInvalid,
259 /* 159 - Cc category */ CharacterInvalid,
260 /* 160 - Zs category (nbsp) */ CharacterWhiteSpace,
261 /* 161 - Po category */ CharacterInvalid,
262 /* 162 - Sc category */ CharacterInvalid,
263 /* 163 - Sc category */ CharacterInvalid,
264 /* 164 - Sc category */ CharacterInvalid,
265 /* 165 - Sc category */ CharacterInvalid,
266 /* 166 - So category */ CharacterInvalid,
267 /* 167 - So category */ CharacterInvalid,
268 /* 168 - Sk category */ CharacterInvalid,
269 /* 169 - So category */ CharacterInvalid,
270 /* 170 - Ll category */ CharacterIdentifierStart,
271 /* 171 - Pi category */ CharacterInvalid,
272 /* 172 - Sm category */ CharacterInvalid,
273 /* 173 - Cf category */ CharacterInvalid,
274 /* 174 - So category */ CharacterInvalid,
275 /* 175 - Sk category */ CharacterInvalid,
276 /* 176 - So category */ CharacterInvalid,
277 /* 177 - Sm category */ CharacterInvalid,
278 /* 178 - No category */ CharacterInvalid,
279 /* 179 - No category */ CharacterInvalid,
280 /* 180 - Sk category */ CharacterInvalid,
281 /* 181 - Ll category */ CharacterIdentifierStart,
282 /* 182 - So category */ CharacterInvalid,
283 /* 183 - Po category */ CharacterInvalid,
284 /* 184 - Sk category */ CharacterInvalid,
285 /* 185 - No category */ CharacterInvalid,
286 /* 186 - Ll category */ CharacterIdentifierStart,
287 /* 187 - Pf category */ CharacterInvalid,
288 /* 188 - No category */ CharacterInvalid,
289 /* 189 - No category */ CharacterInvalid,
290 /* 190 - No category */ CharacterInvalid,
291 /* 191 - Po category */ CharacterInvalid,
292 /* 192 - Lu category */ CharacterIdentifierStart,
293 /* 193 - Lu category */ CharacterIdentifierStart,
294 /* 194 - Lu category */ CharacterIdentifierStart,
295 /* 195 - Lu category */ CharacterIdentifierStart,
296 /* 196 - Lu category */ CharacterIdentifierStart,
297 /* 197 - Lu category */ CharacterIdentifierStart,
298 /* 198 - Lu category */ CharacterIdentifierStart,
299 /* 199 - Lu category */ CharacterIdentifierStart,
300 /* 200 - Lu category */ CharacterIdentifierStart,
301 /* 201 - Lu category */ CharacterIdentifierStart,
302 /* 202 - Lu category */ CharacterIdentifierStart,
303 /* 203 - Lu category */ CharacterIdentifierStart,
304 /* 204 - Lu category */ CharacterIdentifierStart,
305 /* 205 - Lu category */ CharacterIdentifierStart,
306 /* 206 - Lu category */ CharacterIdentifierStart,
307 /* 207 - Lu category */ CharacterIdentifierStart,
308 /* 208 - Lu category */ CharacterIdentifierStart,
309 /* 209 - Lu category */ CharacterIdentifierStart,
310 /* 210 - Lu category */ CharacterIdentifierStart,
311 /* 211 - Lu category */ CharacterIdentifierStart,
312 /* 212 - Lu category */ CharacterIdentifierStart,
313 /* 213 - Lu category */ CharacterIdentifierStart,
314 /* 214 - Lu category */ CharacterIdentifierStart,
315 /* 215 - Sm category */ CharacterInvalid,
316 /* 216 - Lu category */ CharacterIdentifierStart,
317 /* 217 - Lu category */ CharacterIdentifierStart,
318 /* 218 - Lu category */ CharacterIdentifierStart,
319 /* 219 - Lu category */ CharacterIdentifierStart,
320 /* 220 - Lu category */ CharacterIdentifierStart,
321 /* 221 - Lu category */ CharacterIdentifierStart,
322 /* 222 - Lu category */ CharacterIdentifierStart,
323 /* 223 - Ll category */ CharacterIdentifierStart,
324 /* 224 - Ll category */ CharacterIdentifierStart,
325 /* 225 - Ll category */ CharacterIdentifierStart,
326 /* 226 - Ll category */ CharacterIdentifierStart,
327 /* 227 - Ll category */ CharacterIdentifierStart,
328 /* 228 - Ll category */ CharacterIdentifierStart,
329 /* 229 - Ll category */ CharacterIdentifierStart,
330 /* 230 - Ll category */ CharacterIdentifierStart,
331 /* 231 - Ll category */ CharacterIdentifierStart,
332 /* 232 - Ll category */ CharacterIdentifierStart,
333 /* 233 - Ll category */ CharacterIdentifierStart,
334 /* 234 - Ll category */ CharacterIdentifierStart,
335 /* 235 - Ll category */ CharacterIdentifierStart,
336 /* 236 - Ll category */ CharacterIdentifierStart,
337 /* 237 - Ll category */ CharacterIdentifierStart,
338 /* 238 - Ll category */ CharacterIdentifierStart,
339 /* 239 - Ll category */ CharacterIdentifierStart,
340 /* 240 - Ll category */ CharacterIdentifierStart,
341 /* 241 - Ll category */ CharacterIdentifierStart,
342 /* 242 - Ll category */ CharacterIdentifierStart,
343 /* 243 - Ll category */ CharacterIdentifierStart,
344 /* 244 - Ll category */ CharacterIdentifierStart,
345 /* 245 - Ll category */ CharacterIdentifierStart,
346 /* 246 - Ll category */ CharacterIdentifierStart,
347 /* 247 - Sm category */ CharacterInvalid,
348 /* 248 - Ll category */ CharacterIdentifierStart,
349 /* 249 - Ll category */ CharacterIdentifierStart,
350 /* 250 - Ll category */ CharacterIdentifierStart,
351 /* 251 - Ll category */ CharacterIdentifierStart,
352 /* 252 - Ll category */ CharacterIdentifierStart,
353 /* 253 - Ll category */ CharacterIdentifierStart,
354 /* 254 - Ll category */ CharacterIdentifierStart,
355 /* 255 - Ll category */ CharacterIdentifierStart
356 };
357
358 // This table provides the character that results from \X where X is the index in the table beginning
359 // with SPACE. A table value of 0 means that more processing needs to be done.
360 static const LChar singleCharacterEscapeValuesForASCII[128] = {
361 /* 0 - Null */ 0,
362 /* 1 - Start of Heading */ 0,
363 /* 2 - Start of Text */ 0,
364 /* 3 - End of Text */ 0,
365 /* 4 - End of Transm. */ 0,
366 /* 5 - Enquiry */ 0,
367 /* 6 - Acknowledgment */ 0,
368 /* 7 - Bell */ 0,
369 /* 8 - Back Space */ 0,
370 /* 9 - Horizontal Tab */ 0,
371 /* 10 - Line Feed */ 0,
372 /* 11 - Vertical Tab */ 0,
373 /* 12 - Form Feed */ 0,
374 /* 13 - Carriage Return */ 0,
375 /* 14 - Shift Out */ 0,
376 /* 15 - Shift In */ 0,
377 /* 16 - Data Line Escape */ 0,
378 /* 17 - Device Control 1 */ 0,
379 /* 18 - Device Control 2 */ 0,
380 /* 19 - Device Control 3 */ 0,
381 /* 20 - Device Control 4 */ 0,
382 /* 21 - Negative Ack. */ 0,
383 /* 22 - Synchronous Idle */ 0,
384 /* 23 - End of Transmit */ 0,
385 /* 24 - Cancel */ 0,
386 /* 25 - End of Medium */ 0,
387 /* 26 - Substitute */ 0,
388 /* 27 - Escape */ 0,
389 /* 28 - File Separator */ 0,
390 /* 29 - Group Separator */ 0,
391 /* 30 - Record Separator */ 0,
392 /* 31 - Unit Separator */ 0,
393 /* 32 - Space */ ' ',
394 /* 33 - ! */ '!',
395 /* 34 - " */ '"',
396 /* 35 - # */ '#',
397 /* 36 - $ */ '$',
398 /* 37 - % */ '%',
399 /* 38 - & */ '&',
400 /* 39 - ' */ '\'',
401 /* 40 - ( */ '(',
402 /* 41 - ) */ ')',
403 /* 42 - * */ '*',
404 /* 43 - + */ '+',
405 /* 44 - , */ ',',
406 /* 45 - - */ '-',
407 /* 46 - . */ '.',
408 /* 47 - / */ '/',
409 /* 48 - 0 */ 0,
410 /* 49 - 1 */ 0,
411 /* 50 - 2 */ 0,
412 /* 51 - 3 */ 0,
413 /* 52 - 4 */ 0,
414 /* 53 - 5 */ 0,
415 /* 54 - 6 */ 0,
416 /* 55 - 7 */ 0,
417 /* 56 - 8 */ 0,
418 /* 57 - 9 */ 0,
419 /* 58 - : */ ':',
420 /* 59 - ; */ ';',
421 /* 60 - < */ '<',
422 /* 61 - = */ '=',
423 /* 62 - > */ '>',
424 /* 63 - ? */ '?',
425 /* 64 - @ */ '@',
426 /* 65 - A */ 'A',
427 /* 66 - B */ 'B',
428 /* 67 - C */ 'C',
429 /* 68 - D */ 'D',
430 /* 69 - E */ 'E',
431 /* 70 - F */ 'F',
432 /* 71 - G */ 'G',
433 /* 72 - H */ 'H',
434 /* 73 - I */ 'I',
435 /* 74 - J */ 'J',
436 /* 75 - K */ 'K',
437 /* 76 - L */ 'L',
438 /* 77 - M */ 'M',
439 /* 78 - N */ 'N',
440 /* 79 - O */ 'O',
441 /* 80 - P */ 'P',
442 /* 81 - Q */ 'Q',
443 /* 82 - R */ 'R',
444 /* 83 - S */ 'S',
445 /* 84 - T */ 'T',
446 /* 85 - U */ 'U',
447 /* 86 - V */ 'V',
448 /* 87 - W */ 'W',
449 /* 88 - X */ 'X',
450 /* 89 - Y */ 'Y',
451 /* 90 - Z */ 'Z',
452 /* 91 - [ */ '[',
453 /* 92 - \ */ '\\',
454 /* 93 - ] */ ']',
455 /* 94 - ^ */ '^',
456 /* 95 - _ */ '_',
457 /* 96 - ` */ '`',
458 /* 97 - a */ 'a',
459 /* 98 - b */ 0x08,
460 /* 99 - c */ 'c',
461 /* 100 - d */ 'd',
462 /* 101 - e */ 'e',
463 /* 102 - f */ 0x0C,
464 /* 103 - g */ 'g',
465 /* 104 - h */ 'h',
466 /* 105 - i */ 'i',
467 /* 106 - j */ 'j',
468 /* 107 - k */ 'k',
469 /* 108 - l */ 'l',
470 /* 109 - m */ 'm',
471 /* 110 - n */ 0x0A,
472 /* 111 - o */ 'o',
473 /* 112 - p */ 'p',
474 /* 113 - q */ 'q',
475 /* 114 - r */ 0x0D,
476 /* 115 - s */ 's',
477 /* 116 - t */ 0x09,
478 /* 117 - u */ 0,
479 /* 118 - v */ 0x0B,
480 /* 119 - w */ 'w',
481 /* 120 - x */ 0,
482 /* 121 - y */ 'y',
483 /* 122 - z */ 'z',
484 /* 123 - { */ '{',
485 /* 124 - | */ '|',
486 /* 125 - } */ '}',
487 /* 126 - ~ */ '~',
488 /* 127 - Delete */ 0
489 };
490
491 template <typename T>
492 Lexer<T>::Lexer(VM* vm)
493 : m_isReparsing(false)
494 , m_vm(vm)
495 {
496 }
497
498 template <typename T>
499 Lexer<T>::~Lexer()
500 {
501 }
502
503 template <typename T>
504 String Lexer<T>::invalidCharacterMessage() const
505 {
506 switch (m_current) {
507 case 0:
508 return "Invalid character: '\\0'";
509 case 10:
510 return "Invalid character: '\\n'";
511 case 11:
512 return "Invalid character: '\\v'";
513 case 13:
514 return "Invalid character: '\\r'";
515 case 35:
516 return "Invalid character: '#'";
517 case 64:
518 return "Invalid character: '@'";
519 case 96:
520 return "Invalid character: '`'";
521 default:
522 return String::format("Invalid character '\\u%04u'", static_cast<unsigned>(m_current)).impl();
523 }
524 }
525
526 template <typename T>
527 ALWAYS_INLINE const T* Lexer<T>::currentSourcePtr() const
528 {
529 ASSERT(m_code <= m_codeEnd);
530 return m_code;
531 }
532
533 template <typename T>
534 void Lexer<T>::setCode(const SourceCode& source, ParserArena* arena)
535 {
536 m_arena = &arena->identifierArena();
537
538 m_lineNumber = source.firstLine();
539 m_lastToken = -1;
540
541 const String& sourceString = source.provider()->source();
542
543 if (!sourceString.isNull())
544 setCodeStart(sourceString.impl());
545 else
546 m_codeStart = 0;
547
548 m_source = &source;
549 m_sourceOffset = source.startOffset();
550 m_codeStartPlusOffset = m_codeStart + source.startOffset();
551 m_code = m_codeStartPlusOffset;
552 m_codeEnd = m_codeStart + source.endOffset();
553 m_error = false;
554 m_atLineStart = true;
555 m_lineStart = m_code;
556 m_lexErrorMessage = String();
557
558 m_buffer8.reserveInitialCapacity(initialReadBufferCapacity);
559 m_buffer16.reserveInitialCapacity((m_codeEnd - m_code) / 2);
560
561 if (LIKELY(m_code < m_codeEnd))
562 m_current = *m_code;
563 else
564 m_current = 0;
565 ASSERT(currentOffset() == source.startOffset());
566 }
567
568 template <typename T>
569 template <int shiftAmount> ALWAYS_INLINE void Lexer<T>::internalShift()
570 {
571 m_code += shiftAmount;
572 ASSERT(currentOffset() >= currentLineStartOffset());
573 m_current = *m_code;
574 }
575
576 template <typename T>
577 ALWAYS_INLINE void Lexer<T>::shift()
578 {
579 // At one point timing showed that setting m_current to 0 unconditionally was faster than an if-else sequence.
580 m_current = 0;
581 ++m_code;
582 if (LIKELY(m_code < m_codeEnd))
583 m_current = *m_code;
584 }
585
586 template <typename T>
587 ALWAYS_INLINE bool Lexer<T>::atEnd() const
588 {
589 ASSERT(!m_current || m_code < m_codeEnd);
590 return UNLIKELY(UNLIKELY(!m_current) && m_code == m_codeEnd);
591 }
592
593 template <typename T>
594 ALWAYS_INLINE T Lexer<T>::peek(int offset) const
595 {
596 ASSERT(offset > 0 && offset < 5);
597 const T* code = m_code + offset;
598 return (code < m_codeEnd) ? *code : 0;
599 }
600
601 template <typename T>
602 typename Lexer<T>::UnicodeHexValue Lexer<T>::parseFourDigitUnicodeHex()
603 {
604 T char1 = peek(1);
605 T char2 = peek(2);
606 T char3 = peek(3);
607
608 if (UNLIKELY(!isASCIIHexDigit(m_current) || !isASCIIHexDigit(char1) || !isASCIIHexDigit(char2) || !isASCIIHexDigit(char3)))
609 return UnicodeHexValue((m_code + 4) >= m_codeEnd ? UnicodeHexValue::IncompleteHex : UnicodeHexValue::InvalidHex);
610
611 int result = convertUnicode(m_current, char1, char2, char3);
612 shift();
613 shift();
614 shift();
615 shift();
616 return UnicodeHexValue(result);
617 }
618
619 template <typename T>
620 void Lexer<T>::shiftLineTerminator()
621 {
622 ASSERT(isLineTerminator(m_current));
623
624 T prev = m_current;
625 shift();
626
627 // Allow both CRLF and LFCR.
628 if (prev + m_current == '\n' + '\r')
629 shift();
630
631 ++m_lineNumber;
632 }
633
634 template <typename T>
635 ALWAYS_INLINE bool Lexer<T>::lastTokenWasRestrKeyword() const
636 {
637 return m_lastToken == CONTINUE || m_lastToken == BREAK || m_lastToken == RETURN || m_lastToken == THROW;
638 }
639
640 static NEVER_INLINE bool isNonLatin1IdentStart(int c)
641 {
642 return category(c) & (Letter_Uppercase | Letter_Lowercase | Letter_Titlecase | Letter_Modifier | Letter_Other);
643 }
644
645 static ALWAYS_INLINE bool isLatin1(LChar)
646 {
647 return true;
648 }
649
650 static ALWAYS_INLINE bool isLatin1(UChar c)
651 {
652 return c < 256;
653 }
654
655 static inline bool isIdentStart(LChar c)
656 {
657 return typesOfLatin1Characters[c] == CharacterIdentifierStart;
658 }
659
660 static inline bool isIdentStart(UChar c)
661 {
662 return isLatin1(c) ? isIdentStart(static_cast<LChar>(c)) : isNonLatin1IdentStart(c);
663 }
664
665 static NEVER_INLINE bool isNonLatin1IdentPart(int c)
666 {
667 return (category(c) & (Letter_Uppercase | Letter_Lowercase | Letter_Titlecase | Letter_Modifier | Letter_Other
668 | Mark_NonSpacing | Mark_SpacingCombining | Number_DecimalDigit | Punctuation_Connector)) || c == 0x200C || c == 0x200D;
669 }
670
671 static ALWAYS_INLINE bool isIdentPart(LChar c)
672 {
673 // Character types are divided into two groups depending on whether they can be part of an
674 // identifier or not. Those whose type value is less or equal than CharacterNumber can be
675 // part of an identifier. (See the CharacterType definition for more details.)
676 return typesOfLatin1Characters[c] <= CharacterNumber;
677 }
678
679 static ALWAYS_INLINE bool isIdentPart(UChar c)
680 {
681 return isLatin1(c) ? isIdentPart(static_cast<LChar>(c)) : isNonLatin1IdentPart(c);
682 }
683
684 static inline LChar singleEscape(int c)
685 {
686 if (c < 128) {
687 ASSERT(static_cast<size_t>(c) < ARRAY_SIZE(singleCharacterEscapeValuesForASCII));
688 return singleCharacterEscapeValuesForASCII[c];
689 }
690 return 0;
691 }
692
693 template <typename T>
694 inline void Lexer<T>::record8(int c)
695 {
696 ASSERT(c >= 0);
697 ASSERT(c <= 0xFF);
698 m_buffer8.append(static_cast<LChar>(c));
699 }
700
701 template <typename T>
702 inline void assertCharIsIn8BitRange(T c)
703 {
704 UNUSED_PARAM(c);
705 ASSERT(c >= 0);
706 ASSERT(c <= 0xFF);
707 }
708
709 template <>
710 inline void assertCharIsIn8BitRange(UChar c)
711 {
712 UNUSED_PARAM(c);
713 ASSERT(c <= 0xFF);
714 }
715
716 template <>
717 inline void assertCharIsIn8BitRange(LChar)
718 {
719 }
720
721 template <typename T>
722 inline void Lexer<T>::append8(const T* p, size_t length)
723 {
724 size_t currentSize = m_buffer8.size();
725 m_buffer8.grow(currentSize + length);
726 LChar* rawBuffer = m_buffer8.data() + currentSize;
727
728 for (size_t i = 0; i < length; i++) {
729 T c = p[i];
730 assertCharIsIn8BitRange(c);
731 rawBuffer[i] = c;
732 }
733 }
734
735 template <typename T>
736 inline void Lexer<T>::append16(const LChar* p, size_t length)
737 {
738 size_t currentSize = m_buffer16.size();
739 m_buffer16.grow(currentSize + length);
740 UChar* rawBuffer = m_buffer16.data() + currentSize;
741
742 for (size_t i = 0; i < length; i++)
743 rawBuffer[i] = p[i];
744 }
745
746 template <typename T>
747 inline void Lexer<T>::record16(T c)
748 {
749 m_buffer16.append(c);
750 }
751
752 template <typename T>
753 inline void Lexer<T>::record16(int c)
754 {
755 ASSERT(c >= 0);
756 ASSERT(c <= static_cast<int>(USHRT_MAX));
757 m_buffer16.append(static_cast<UChar>(c));
758 }
759
760 template <>
761 template <bool shouldCreateIdentifier> ALWAYS_INLINE JSTokenType Lexer<LChar>::parseIdentifier(JSTokenData* tokenData, unsigned lexerFlags, bool strictMode)
762 {
763 const ptrdiff_t remaining = m_codeEnd - m_code;
764 if ((remaining >= maxTokenLength) && !(lexerFlags & LexerFlagsIgnoreReservedWords)) {
765 JSTokenType keyword = parseKeyword<shouldCreateIdentifier>(tokenData);
766 if (keyword != IDENT) {
767 ASSERT((!shouldCreateIdentifier) || tokenData->ident);
768 return keyword == RESERVED_IF_STRICT && !strictMode ? IDENT : keyword;
769 }
770 }
771
772 const LChar* identifierStart = currentSourcePtr();
773 unsigned identifierLineStart = currentLineStartOffset();
774
775 while (isIdentPart(m_current))
776 shift();
777
778 if (UNLIKELY(m_current == '\\')) {
779 setOffsetFromSourcePtr(identifierStart, identifierLineStart);
780 return parseIdentifierSlowCase<shouldCreateIdentifier>(tokenData, lexerFlags, strictMode);
781 }
782
783 const Identifier* ident = 0;
784
785 if (shouldCreateIdentifier) {
786 int identifierLength = currentSourcePtr() - identifierStart;
787 ident = makeIdentifier(identifierStart, identifierLength);
788
789 tokenData->ident = ident;
790 } else
791 tokenData->ident = 0;
792
793 if (UNLIKELY((remaining < maxTokenLength) && !(lexerFlags & LexerFlagsIgnoreReservedWords))) {
794 ASSERT(shouldCreateIdentifier);
795 if (remaining < maxTokenLength) {
796 const HashEntry* entry = m_vm->keywords->getKeyword(*ident);
797 ASSERT((remaining < maxTokenLength) || !entry);
798 if (!entry)
799 return IDENT;
800 JSTokenType token = static_cast<JSTokenType>(entry->lexerValue());
801 return (token != RESERVED_IF_STRICT) || strictMode ? token : IDENT;
802 }
803 return IDENT;
804 }
805
806 return IDENT;
807 }
808
809 template <>
810 template <bool shouldCreateIdentifier> ALWAYS_INLINE JSTokenType Lexer<UChar>::parseIdentifier(JSTokenData* tokenData, unsigned lexerFlags, bool strictMode)
811 {
812 const ptrdiff_t remaining = m_codeEnd - m_code;
813 if ((remaining >= maxTokenLength) && !(lexerFlags & LexerFlagsIgnoreReservedWords)) {
814 JSTokenType keyword = parseKeyword<shouldCreateIdentifier>(tokenData);
815 if (keyword != IDENT) {
816 ASSERT((!shouldCreateIdentifier) || tokenData->ident);
817 return keyword == RESERVED_IF_STRICT && !strictMode ? IDENT : keyword;
818 }
819 }
820
821 const UChar* identifierStart = currentSourcePtr();
822 int identifierLineStart = currentLineStartOffset();
823
824 UChar orAllChars = 0;
825
826 while (isIdentPart(m_current)) {
827 orAllChars |= m_current;
828 shift();
829 }
830
831 if (UNLIKELY(m_current == '\\')) {
832 setOffsetFromSourcePtr(identifierStart, identifierLineStart);
833 return parseIdentifierSlowCase<shouldCreateIdentifier>(tokenData, lexerFlags, strictMode);
834 }
835
836 bool isAll8Bit = false;
837
838 if (!(orAllChars & ~0xff))
839 isAll8Bit = true;
840
841 const Identifier* ident = 0;
842
843 if (shouldCreateIdentifier) {
844 int identifierLength = currentSourcePtr() - identifierStart;
845 if (isAll8Bit)
846 ident = makeIdentifierLCharFromUChar(identifierStart, identifierLength);
847 else
848 ident = makeIdentifier(identifierStart, identifierLength);
849
850 tokenData->ident = ident;
851 } else
852 tokenData->ident = 0;
853
854 if (UNLIKELY((remaining < maxTokenLength) && !(lexerFlags & LexerFlagsIgnoreReservedWords))) {
855 ASSERT(shouldCreateIdentifier);
856 if (remaining < maxTokenLength) {
857 const HashEntry* entry = m_vm->keywords->getKeyword(*ident);
858 ASSERT((remaining < maxTokenLength) || !entry);
859 if (!entry)
860 return IDENT;
861 JSTokenType token = static_cast<JSTokenType>(entry->lexerValue());
862 return (token != RESERVED_IF_STRICT) || strictMode ? token : IDENT;
863 }
864 return IDENT;
865 }
866
867 return IDENT;
868 }
869
870 template <typename T>
871 template <bool shouldCreateIdentifier> JSTokenType Lexer<T>::parseIdentifierSlowCase(JSTokenData* tokenData, unsigned lexerFlags, bool strictMode)
872 {
873 const ptrdiff_t remaining = m_codeEnd - m_code;
874 const T* identifierStart = currentSourcePtr();
875 bool bufferRequired = false;
876
877 while (true) {
878 if (LIKELY(isIdentPart(m_current))) {
879 shift();
880 continue;
881 }
882 if (LIKELY(m_current != '\\'))
883 break;
884
885 // \uXXXX unicode characters.
886 bufferRequired = true;
887 if (identifierStart != currentSourcePtr())
888 m_buffer16.append(identifierStart, currentSourcePtr() - identifierStart);
889 shift();
890 if (UNLIKELY(m_current != 'u'))
891 return atEnd() ? UNTERMINATED_IDENTIFIER_ESCAPE_ERRORTOK : INVALID_IDENTIFIER_ESCAPE_ERRORTOK;
892 shift();
893 UnicodeHexValue character = parseFourDigitUnicodeHex();
894 if (UNLIKELY(!character.isValid()))
895 return character.valueType() == UnicodeHexValue::IncompleteHex ? UNTERMINATED_IDENTIFIER_UNICODE_ESCAPE_ERRORTOK : INVALID_IDENTIFIER_UNICODE_ESCAPE_ERRORTOK;
896 UChar ucharacter = static_cast<UChar>(character.value());
897 if (UNLIKELY(m_buffer16.size() ? !isIdentPart(ucharacter) : !isIdentStart(ucharacter)))
898 return INVALID_IDENTIFIER_UNICODE_ESCAPE_ERRORTOK;
899 if (shouldCreateIdentifier)
900 record16(ucharacter);
901 identifierStart = currentSourcePtr();
902 }
903
904 int identifierLength;
905 const Identifier* ident = 0;
906 if (shouldCreateIdentifier) {
907 if (!bufferRequired) {
908 identifierLength = currentSourcePtr() - identifierStart;
909 ident = makeIdentifier(identifierStart, identifierLength);
910 } else {
911 if (identifierStart != currentSourcePtr())
912 m_buffer16.append(identifierStart, currentSourcePtr() - identifierStart);
913 ident = makeIdentifier(m_buffer16.data(), m_buffer16.size());
914 }
915
916 tokenData->ident = ident;
917 } else
918 tokenData->ident = 0;
919
920 if (LIKELY(!bufferRequired && !(lexerFlags & LexerFlagsIgnoreReservedWords))) {
921 ASSERT(shouldCreateIdentifier);
922 // Keywords must not be recognized if there was an \uXXXX in the identifier.
923 if (remaining < maxTokenLength) {
924 const HashEntry* entry = m_vm->keywords->getKeyword(*ident);
925 ASSERT((remaining < maxTokenLength) || !entry);
926 if (!entry)
927 return IDENT;
928 JSTokenType token = static_cast<JSTokenType>(entry->lexerValue());
929 return (token != RESERVED_IF_STRICT) || strictMode ? token : IDENT;
930 }
931 return IDENT;
932 }
933
934 m_buffer16.resize(0);
935 return IDENT;
936 }
937
938 static ALWAYS_INLINE bool characterRequiresParseStringSlowCase(LChar character)
939 {
940 return character < 0xE;
941 }
942
943 static ALWAYS_INLINE bool characterRequiresParseStringSlowCase(UChar character)
944 {
945 return character < 0xE || character > 0xFF;
946 }
947
948 template <typename T>
949 template <bool shouldBuildStrings> ALWAYS_INLINE typename Lexer<T>::StringParseResult Lexer<T>::parseString(JSTokenData* tokenData, bool strictMode)
950 {
951 int startingOffset = currentOffset();
952 int startingLineStartOffset = currentLineStartOffset();
953 int startingLineNumber = lineNumber();
954 T stringQuoteCharacter = m_current;
955 shift();
956
957 const T* stringStart = currentSourcePtr();
958
959 while (m_current != stringQuoteCharacter) {
960 if (UNLIKELY(m_current == '\\')) {
961 if (stringStart != currentSourcePtr() && shouldBuildStrings)
962 append8(stringStart, currentSourcePtr() - stringStart);
963 shift();
964
965 LChar escape = singleEscape(m_current);
966
967 // Most common escape sequences first
968 if (escape) {
969 if (shouldBuildStrings)
970 record8(escape);
971 shift();
972 } else if (UNLIKELY(isLineTerminator(m_current)))
973 shiftLineTerminator();
974 else if (m_current == 'x') {
975 shift();
976 if (!isASCIIHexDigit(m_current) || !isASCIIHexDigit(peek(1))) {
977 m_lexErrorMessage = "\\x can only be followed by a hex character sequence";
978 return (atEnd() || (isASCIIHexDigit(m_current) && (m_code + 1 == m_codeEnd))) ? StringUnterminated : StringCannotBeParsed;
979 }
980 T prev = m_current;
981 shift();
982 if (shouldBuildStrings)
983 record8(convertHex(prev, m_current));
984 shift();
985 } else {
986 setOffset(startingOffset, startingLineStartOffset);
987 setLineNumber(startingLineNumber);
988 m_buffer8.resize(0);
989 return parseStringSlowCase<shouldBuildStrings>(tokenData, strictMode);
990 }
991 stringStart = currentSourcePtr();
992 continue;
993 }
994
995 if (UNLIKELY(characterRequiresParseStringSlowCase(m_current))) {
996 setOffset(startingOffset, startingLineStartOffset);
997 setLineNumber(startingLineNumber);
998 m_buffer8.resize(0);
999 return parseStringSlowCase<shouldBuildStrings>(tokenData, strictMode);
1000 }
1001
1002 shift();
1003 }
1004
1005 if (currentSourcePtr() != stringStart && shouldBuildStrings)
1006 append8(stringStart, currentSourcePtr() - stringStart);
1007 if (shouldBuildStrings) {
1008 tokenData->ident = makeIdentifier(m_buffer8.data(), m_buffer8.size());
1009 m_buffer8.resize(0);
1010 } else
1011 tokenData->ident = 0;
1012
1013 return StringParsedSuccessfully;
1014 }
1015
1016 template <typename T>
1017 template <bool shouldBuildStrings> typename Lexer<T>::StringParseResult Lexer<T>::parseStringSlowCase(JSTokenData* tokenData, bool strictMode)
1018 {
1019 T stringQuoteCharacter = m_current;
1020 shift();
1021
1022 const T* stringStart = currentSourcePtr();
1023
1024 while (m_current != stringQuoteCharacter) {
1025 if (UNLIKELY(m_current == '\\')) {
1026 if (stringStart != currentSourcePtr() && shouldBuildStrings)
1027 append16(stringStart, currentSourcePtr() - stringStart);
1028 shift();
1029
1030 LChar escape = singleEscape(m_current);
1031
1032 // Most common escape sequences first
1033 if (escape) {
1034 if (shouldBuildStrings)
1035 record16(escape);
1036 shift();
1037 } else if (UNLIKELY(isLineTerminator(m_current)))
1038 shiftLineTerminator();
1039 else if (m_current == 'x') {
1040 shift();
1041 if (!isASCIIHexDigit(m_current) || !isASCIIHexDigit(peek(1))) {
1042 m_lexErrorMessage = "\\x can only be followed by a hex character sequence";
1043 return StringCannotBeParsed;
1044 }
1045 T prev = m_current;
1046 shift();
1047 if (shouldBuildStrings)
1048 record16(convertHex(prev, m_current));
1049 shift();
1050 } else if (m_current == 'u') {
1051 shift();
1052 UnicodeHexValue character = parseFourDigitUnicodeHex();
1053 if (character.isValid()) {
1054 if (shouldBuildStrings)
1055 record16(character.value());
1056 } else if (m_current == stringQuoteCharacter) {
1057 if (shouldBuildStrings)
1058 record16('u');
1059 } else {
1060 m_lexErrorMessage = "\\u can only be followed by a Unicode character sequence";
1061 return character.valueType() == UnicodeHexValue::IncompleteHex ? StringUnterminated : StringCannotBeParsed;
1062 }
1063 } else if (strictMode && isASCIIDigit(m_current)) {
1064 // The only valid numeric escape in strict mode is '\0', and this must not be followed by a decimal digit.
1065 int character1 = m_current;
1066 shift();
1067 if (character1 != '0' || isASCIIDigit(m_current)) {
1068 m_lexErrorMessage = "The only valid numeric escape in strict mode is '\\0'";
1069 return StringCannotBeParsed;
1070 }
1071 if (shouldBuildStrings)
1072 record16(0);
1073 } else if (!strictMode && isASCIIOctalDigit(m_current)) {
1074 // Octal character sequences
1075 T character1 = m_current;
1076 shift();
1077 if (isASCIIOctalDigit(m_current)) {
1078 // Two octal characters
1079 T character2 = m_current;
1080 shift();
1081 if (character1 >= '0' && character1 <= '3' && isASCIIOctalDigit(m_current)) {
1082 if (shouldBuildStrings)
1083 record16((character1 - '0') * 64 + (character2 - '0') * 8 + m_current - '0');
1084 shift();
1085 } else {
1086 if (shouldBuildStrings)
1087 record16((character1 - '0') * 8 + character2 - '0');
1088 }
1089 } else {
1090 if (shouldBuildStrings)
1091 record16(character1 - '0');
1092 }
1093 } else if (!atEnd()) {
1094 if (shouldBuildStrings)
1095 record16(m_current);
1096 shift();
1097 } else {
1098 m_lexErrorMessage = "Unterminated string constant";
1099 return StringUnterminated;
1100 }
1101
1102 stringStart = currentSourcePtr();
1103 continue;
1104 }
1105 // Fast check for characters that require special handling.
1106 // Catches 0, \n, \r, 0x2028, and 0x2029 as efficiently
1107 // as possible, and lets through all common ASCII characters.
1108 if (UNLIKELY(((static_cast<unsigned>(m_current) - 0xE) & 0x2000))) {
1109 // New-line or end of input is not allowed
1110 if (atEnd() || isLineTerminator(m_current)) {
1111 m_lexErrorMessage = "Unexpected EOF";
1112 return atEnd() ? StringUnterminated : StringCannotBeParsed;
1113 }
1114 // Anything else is just a normal character
1115 }
1116 shift();
1117 }
1118
1119 if (currentSourcePtr() != stringStart && shouldBuildStrings)
1120 append16(stringStart, currentSourcePtr() - stringStart);
1121 if (shouldBuildStrings)
1122 tokenData->ident = makeIdentifier(m_buffer16.data(), m_buffer16.size());
1123 else
1124 tokenData->ident = 0;
1125
1126 m_buffer16.resize(0);
1127 return StringParsedSuccessfully;
1128 }
1129
1130 template <typename T>
1131 ALWAYS_INLINE void Lexer<T>::parseHex(double& returnValue)
1132 {
1133 // Optimization: most hexadecimal values fit into 4 bytes.
1134 uint32_t hexValue = 0;
1135 int maximumDigits = 7;
1136
1137 // Shift out the 'x' prefix.
1138 shift();
1139
1140 do {
1141 hexValue = (hexValue << 4) + toASCIIHexValue(m_current);
1142 shift();
1143 --maximumDigits;
1144 } while (isASCIIHexDigit(m_current) && maximumDigits >= 0);
1145
1146 if (maximumDigits >= 0) {
1147 returnValue = hexValue;
1148 return;
1149 }
1150
1151 // No more place in the hexValue buffer.
1152 // The values are shifted out and placed into the m_buffer8 vector.
1153 for (int i = 0; i < 8; ++i) {
1154 int digit = hexValue >> 28;
1155 if (digit < 10)
1156 record8(digit + '0');
1157 else
1158 record8(digit - 10 + 'a');
1159 hexValue <<= 4;
1160 }
1161
1162 while (isASCIIHexDigit(m_current)) {
1163 record8(m_current);
1164 shift();
1165 }
1166
1167 returnValue = parseIntOverflow(m_buffer8.data(), m_buffer8.size(), 16);
1168 }
1169
1170 template <typename T>
1171 ALWAYS_INLINE bool Lexer<T>::parseOctal(double& returnValue)
1172 {
1173 // Optimization: most octal values fit into 4 bytes.
1174 uint32_t octalValue = 0;
1175 int maximumDigits = 9;
1176 // Temporary buffer for the digits. Makes easier
1177 // to reconstruct the input characters when needed.
1178 LChar digits[10];
1179
1180 do {
1181 octalValue = octalValue * 8 + (m_current - '0');
1182 digits[maximumDigits] = m_current;
1183 shift();
1184 --maximumDigits;
1185 } while (isASCIIOctalDigit(m_current) && maximumDigits >= 0);
1186
1187 if (!isASCIIDigit(m_current) && maximumDigits >= 0) {
1188 returnValue = octalValue;
1189 return true;
1190 }
1191
1192 for (int i = 9; i > maximumDigits; --i)
1193 record8(digits[i]);
1194
1195 while (isASCIIOctalDigit(m_current)) {
1196 record8(m_current);
1197 shift();
1198 }
1199
1200 if (isASCIIDigit(m_current))
1201 return false;
1202
1203 returnValue = parseIntOverflow(m_buffer8.data(), m_buffer8.size(), 8);
1204 return true;
1205 }
1206
1207 template <typename T>
1208 ALWAYS_INLINE bool Lexer<T>::parseDecimal(double& returnValue)
1209 {
1210 // Optimization: most decimal values fit into 4 bytes.
1211 uint32_t decimalValue = 0;
1212
1213 // Since parseOctal may be executed before parseDecimal,
1214 // the m_buffer8 may hold ascii digits.
1215 if (!m_buffer8.size()) {
1216 int maximumDigits = 9;
1217 // Temporary buffer for the digits. Makes easier
1218 // to reconstruct the input characters when needed.
1219 LChar digits[10];
1220
1221 do {
1222 decimalValue = decimalValue * 10 + (m_current - '0');
1223 digits[maximumDigits] = m_current;
1224 shift();
1225 --maximumDigits;
1226 } while (isASCIIDigit(m_current) && maximumDigits >= 0);
1227
1228 if (maximumDigits >= 0 && m_current != '.' && (m_current | 0x20) != 'e') {
1229 returnValue = decimalValue;
1230 return true;
1231 }
1232
1233 for (int i = 9; i > maximumDigits; --i)
1234 record8(digits[i]);
1235 }
1236
1237 while (isASCIIDigit(m_current)) {
1238 record8(m_current);
1239 shift();
1240 }
1241
1242 return false;
1243 }
1244
1245 template <typename T>
1246 ALWAYS_INLINE void Lexer<T>::parseNumberAfterDecimalPoint()
1247 {
1248 record8('.');
1249 while (isASCIIDigit(m_current)) {
1250 record8(m_current);
1251 shift();
1252 }
1253 }
1254
1255 template <typename T>
1256 ALWAYS_INLINE bool Lexer<T>::parseNumberAfterExponentIndicator()
1257 {
1258 record8('e');
1259 shift();
1260 if (m_current == '+' || m_current == '-') {
1261 record8(m_current);
1262 shift();
1263 }
1264
1265 if (!isASCIIDigit(m_current))
1266 return false;
1267
1268 do {
1269 record8(m_current);
1270 shift();
1271 } while (isASCIIDigit(m_current));
1272 return true;
1273 }
1274
1275 template <typename T>
1276 ALWAYS_INLINE bool Lexer<T>::parseMultilineComment()
1277 {
1278 while (true) {
1279 while (UNLIKELY(m_current == '*')) {
1280 shift();
1281 if (m_current == '/') {
1282 shift();
1283 return true;
1284 }
1285 }
1286
1287 if (atEnd())
1288 return false;
1289
1290 if (isLineTerminator(m_current)) {
1291 shiftLineTerminator();
1292 m_terminator = true;
1293 } else
1294 shift();
1295 }
1296 }
1297
1298 template <typename T>
1299 bool Lexer<T>::nextTokenIsColon()
1300 {
1301 const T* code = m_code;
1302 while (code < m_codeEnd && (isWhiteSpace(*code) || isLineTerminator(*code)))
1303 code++;
1304
1305 return code < m_codeEnd && *code == ':';
1306 }
1307
1308 template <typename T>
1309 JSTokenType Lexer<T>::lex(JSTokenData* tokenData, JSTokenLocation* tokenLocation, unsigned lexerFlags, bool strictMode)
1310 {
1311 ASSERT(!m_error);
1312 ASSERT(m_buffer8.isEmpty());
1313 ASSERT(m_buffer16.isEmpty());
1314
1315 JSTokenType token = ERRORTOK;
1316 m_terminator = false;
1317
1318 start:
1319 while (isWhiteSpace(m_current))
1320 shift();
1321
1322 if (atEnd())
1323 return EOFTOK;
1324
1325 tokenLocation->startOffset = currentOffset();
1326 ASSERT(currentOffset() >= currentLineStartOffset());
1327
1328 CharacterType type;
1329 if (LIKELY(isLatin1(m_current)))
1330 type = static_cast<CharacterType>(typesOfLatin1Characters[m_current]);
1331 else if (isNonLatin1IdentStart(m_current))
1332 type = CharacterIdentifierStart;
1333 else if (isLineTerminator(m_current))
1334 type = CharacterLineTerminator;
1335 else
1336 type = CharacterInvalid;
1337
1338 switch (type) {
1339 case CharacterGreater:
1340 shift();
1341 if (m_current == '>') {
1342 shift();
1343 if (m_current == '>') {
1344 shift();
1345 if (m_current == '=') {
1346 shift();
1347 token = URSHIFTEQUAL;
1348 break;
1349 }
1350 token = URSHIFT;
1351 break;
1352 }
1353 if (m_current == '=') {
1354 shift();
1355 token = RSHIFTEQUAL;
1356 break;
1357 }
1358 token = RSHIFT;
1359 break;
1360 }
1361 if (m_current == '=') {
1362 shift();
1363 token = GE;
1364 break;
1365 }
1366 token = GT;
1367 break;
1368 case CharacterEqual:
1369 shift();
1370 if (m_current == '=') {
1371 shift();
1372 if (m_current == '=') {
1373 shift();
1374 token = STREQ;
1375 break;
1376 }
1377 token = EQEQ;
1378 break;
1379 }
1380 token = EQUAL;
1381 break;
1382 case CharacterLess:
1383 shift();
1384 if (m_current == '!' && peek(1) == '-' && peek(2) == '-') {
1385 // <!-- marks the beginning of a line comment (for www usage)
1386 goto inSingleLineComment;
1387 }
1388 if (m_current == '<') {
1389 shift();
1390 if (m_current == '=') {
1391 shift();
1392 token = LSHIFTEQUAL;
1393 break;
1394 }
1395 token = LSHIFT;
1396 break;
1397 }
1398 if (m_current == '=') {
1399 shift();
1400 token = LE;
1401 break;
1402 }
1403 token = LT;
1404 break;
1405 case CharacterExclamationMark:
1406 shift();
1407 if (m_current == '=') {
1408 shift();
1409 if (m_current == '=') {
1410 shift();
1411 token = STRNEQ;
1412 break;
1413 }
1414 token = NE;
1415 break;
1416 }
1417 token = EXCLAMATION;
1418 break;
1419 case CharacterAdd:
1420 shift();
1421 if (m_current == '+') {
1422 shift();
1423 token = (!m_terminator) ? PLUSPLUS : AUTOPLUSPLUS;
1424 break;
1425 }
1426 if (m_current == '=') {
1427 shift();
1428 token = PLUSEQUAL;
1429 break;
1430 }
1431 token = PLUS;
1432 break;
1433 case CharacterSub:
1434 shift();
1435 if (m_current == '-') {
1436 shift();
1437 if (m_atLineStart && m_current == '>') {
1438 shift();
1439 goto inSingleLineComment;
1440 }
1441 token = (!m_terminator) ? MINUSMINUS : AUTOMINUSMINUS;
1442 break;
1443 }
1444 if (m_current == '=') {
1445 shift();
1446 token = MINUSEQUAL;
1447 break;
1448 }
1449 token = MINUS;
1450 break;
1451 case CharacterMultiply:
1452 shift();
1453 if (m_current == '=') {
1454 shift();
1455 token = MULTEQUAL;
1456 break;
1457 }
1458 token = TIMES;
1459 break;
1460 case CharacterSlash:
1461 shift();
1462 if (m_current == '/') {
1463 shift();
1464 goto inSingleLineComment;
1465 }
1466 if (m_current == '*') {
1467 shift();
1468 if (parseMultilineComment())
1469 goto start;
1470 m_lexErrorMessage = "Multiline comment was not closed properly";
1471 token = UNTERMINATED_MULTILINE_COMMENT_ERRORTOK;
1472 goto returnError;
1473 }
1474 if (m_current == '=') {
1475 shift();
1476 token = DIVEQUAL;
1477 break;
1478 }
1479 token = DIVIDE;
1480 break;
1481 case CharacterAnd:
1482 shift();
1483 if (m_current == '&') {
1484 shift();
1485 token = AND;
1486 break;
1487 }
1488 if (m_current == '=') {
1489 shift();
1490 token = ANDEQUAL;
1491 break;
1492 }
1493 token = BITAND;
1494 break;
1495 case CharacterXor:
1496 shift();
1497 if (m_current == '=') {
1498 shift();
1499 token = XOREQUAL;
1500 break;
1501 }
1502 token = BITXOR;
1503 break;
1504 case CharacterModulo:
1505 shift();
1506 if (m_current == '=') {
1507 shift();
1508 token = MODEQUAL;
1509 break;
1510 }
1511 token = MOD;
1512 break;
1513 case CharacterOr:
1514 shift();
1515 if (m_current == '=') {
1516 shift();
1517 token = OREQUAL;
1518 break;
1519 }
1520 if (m_current == '|') {
1521 shift();
1522 token = OR;
1523 break;
1524 }
1525 token = BITOR;
1526 break;
1527 case CharacterOpenParen:
1528 token = OPENPAREN;
1529 shift();
1530 break;
1531 case CharacterCloseParen:
1532 token = CLOSEPAREN;
1533 shift();
1534 break;
1535 case CharacterOpenBracket:
1536 token = OPENBRACKET;
1537 shift();
1538 break;
1539 case CharacterCloseBracket:
1540 token = CLOSEBRACKET;
1541 shift();
1542 break;
1543 case CharacterComma:
1544 token = COMMA;
1545 shift();
1546 break;
1547 case CharacterColon:
1548 token = COLON;
1549 shift();
1550 break;
1551 case CharacterQuestion:
1552 token = QUESTION;
1553 shift();
1554 break;
1555 case CharacterTilde:
1556 token = TILDE;
1557 shift();
1558 break;
1559 case CharacterSemicolon:
1560 shift();
1561 token = SEMICOLON;
1562 break;
1563 case CharacterOpenBrace:
1564 tokenData->line = lineNumber();
1565 tokenData->offset = currentOffset();
1566 tokenData->lineStartOffset = currentLineStartOffset();
1567 ASSERT(tokenData->offset >= tokenData->lineStartOffset);
1568 shift();
1569 token = OPENBRACE;
1570 break;
1571 case CharacterCloseBrace:
1572 tokenData->line = lineNumber();
1573 tokenData->offset = currentOffset();
1574 tokenData->lineStartOffset = currentLineStartOffset();
1575 ASSERT(tokenData->offset >= tokenData->lineStartOffset);
1576 shift();
1577 token = CLOSEBRACE;
1578 break;
1579 case CharacterDot:
1580 shift();
1581 if (!isASCIIDigit(m_current)) {
1582 token = DOT;
1583 break;
1584 }
1585 goto inNumberAfterDecimalPoint;
1586 case CharacterZero:
1587 shift();
1588 if ((m_current | 0x20) == 'x' && isASCIIHexDigit(peek(1))) {
1589 parseHex(tokenData->doubleValue);
1590 token = NUMBER;
1591 } else {
1592 record8('0');
1593 if (isASCIIOctalDigit(m_current)) {
1594 if (parseOctal(tokenData->doubleValue)) {
1595 if (strictMode) {
1596 m_lexErrorMessage = "Octal escapes are forbidden in strict mode";
1597 token = INVALID_OCTAL_NUMBER_ERRORTOK;
1598 goto returnError;
1599 }
1600 token = NUMBER;
1601 }
1602 }
1603 }
1604 // Fall through into CharacterNumber
1605 case CharacterNumber:
1606 if (LIKELY(token != NUMBER)) {
1607 if (!parseDecimal(tokenData->doubleValue)) {
1608 if (m_current == '.') {
1609 shift();
1610 inNumberAfterDecimalPoint:
1611 parseNumberAfterDecimalPoint();
1612 }
1613 if ((m_current | 0x20) == 'e') {
1614 if (!parseNumberAfterExponentIndicator()) {
1615 m_lexErrorMessage = "Non-number found after exponent indicator";
1616 token = atEnd() ? UNTERMINATED_NUMERIC_LITERAL_ERRORTOK : INVALID_NUMERIC_LITERAL_ERRORTOK;
1617 goto returnError;
1618 }
1619 }
1620 size_t parsedLength;
1621 tokenData->doubleValue = parseDouble(m_buffer8.data(), m_buffer8.size(), parsedLength);
1622 }
1623 token = NUMBER;
1624 }
1625
1626 // No identifiers allowed directly after numeric literal, e.g. "3in" is bad.
1627 if (UNLIKELY(isIdentStart(m_current))) {
1628 m_lexErrorMessage = "At least one digit must occur after a decimal point";
1629 token = atEnd() ? UNTERMINATED_NUMERIC_LITERAL_ERRORTOK : INVALID_NUMERIC_LITERAL_ERRORTOK;
1630 goto returnError;
1631 }
1632 m_buffer8.resize(0);
1633 break;
1634 case CharacterQuote:
1635 if (lexerFlags & LexerFlagsDontBuildStrings) {
1636 StringParseResult result = parseString<false>(tokenData, strictMode);
1637 if (UNLIKELY(result != StringParsedSuccessfully)) {
1638 token = result == StringUnterminated ? UNTERMINATED_STRING_LITERAL_ERRORTOK : INVALID_STRING_LITERAL_ERRORTOK;
1639 goto returnError;
1640 }
1641 } else {
1642 StringParseResult result = parseString<true>(tokenData, strictMode);
1643 if (UNLIKELY(result != StringParsedSuccessfully)) {
1644 token = result == StringUnterminated ? UNTERMINATED_STRING_LITERAL_ERRORTOK : INVALID_STRING_LITERAL_ERRORTOK;
1645 goto returnError;
1646 }
1647 }
1648 shift();
1649 token = STRING;
1650 break;
1651 case CharacterIdentifierStart:
1652 ASSERT(isIdentStart(m_current));
1653 // Fall through into CharacterBackSlash.
1654 case CharacterBackSlash:
1655 if (lexerFlags & LexexFlagsDontBuildKeywords)
1656 token = parseIdentifier<false>(tokenData, lexerFlags, strictMode);
1657 else
1658 token = parseIdentifier<true>(tokenData, lexerFlags, strictMode);
1659 break;
1660 case CharacterLineTerminator:
1661 ASSERT(isLineTerminator(m_current));
1662 shiftLineTerminator();
1663 m_atLineStart = true;
1664 m_terminator = true;
1665 m_lineStart = m_code;
1666 goto start;
1667 case CharacterInvalid:
1668 m_lexErrorMessage = invalidCharacterMessage();
1669 token = ERRORTOK;
1670 goto returnError;
1671 default:
1672 RELEASE_ASSERT_NOT_REACHED();
1673 m_lexErrorMessage = "Internal Error";
1674 token = ERRORTOK;
1675 goto returnError;
1676 }
1677
1678 m_atLineStart = false;
1679 goto returnToken;
1680
1681 inSingleLineComment:
1682 while (!isLineTerminator(m_current)) {
1683 if (atEnd())
1684 return EOFTOK;
1685 shift();
1686 }
1687 shiftLineTerminator();
1688 m_atLineStart = true;
1689 m_terminator = true;
1690 m_lineStart = m_code;
1691 if (!lastTokenWasRestrKeyword())
1692 goto start;
1693
1694 token = SEMICOLON;
1695 // Fall through into returnToken.
1696
1697 returnToken:
1698 tokenLocation->line = m_lineNumber;
1699 tokenLocation->endOffset = currentOffset();
1700 tokenLocation->lineStartOffset = currentLineStartOffset();
1701 ASSERT(tokenLocation->endOffset >= tokenLocation->lineStartOffset);
1702 m_lastToken = token;
1703 return token;
1704
1705 returnError:
1706 m_error = true;
1707 tokenLocation->line = m_lineNumber;
1708 tokenLocation->endOffset = currentOffset();
1709 tokenLocation->lineStartOffset = currentLineStartOffset();
1710 ASSERT(tokenLocation->endOffset >= tokenLocation->lineStartOffset);
1711 RELEASE_ASSERT(token & ErrorTokenFlag);
1712 return token;
1713 }
1714
1715 template <typename T>
1716 static inline void orCharacter(UChar&, UChar);
1717
1718 template <>
1719 inline void orCharacter<LChar>(UChar&, UChar) { }
1720
1721 template <>
1722 inline void orCharacter<UChar>(UChar& orAccumulator, UChar character)
1723 {
1724 orAccumulator |= character;
1725 }
1726
1727 template <typename T>
1728 bool Lexer<T>::scanRegExp(const Identifier*& pattern, const Identifier*& flags, UChar patternPrefix)
1729 {
1730 ASSERT(m_buffer16.isEmpty());
1731
1732 bool lastWasEscape = false;
1733 bool inBrackets = false;
1734 UChar charactersOredTogether = 0;
1735
1736 if (patternPrefix) {
1737 ASSERT(!isLineTerminator(patternPrefix));
1738 ASSERT(patternPrefix != '/');
1739 ASSERT(patternPrefix != '[');
1740 record16(patternPrefix);
1741 }
1742
1743 while (true) {
1744 if (isLineTerminator(m_current) || atEnd()) {
1745 m_buffer16.resize(0);
1746 return false;
1747 }
1748
1749 T prev = m_current;
1750
1751 shift();
1752
1753 if (prev == '/' && !lastWasEscape && !inBrackets)
1754 break;
1755
1756 record16(prev);
1757 orCharacter<T>(charactersOredTogether, prev);
1758
1759 if (lastWasEscape) {
1760 lastWasEscape = false;
1761 continue;
1762 }
1763
1764 switch (prev) {
1765 case '[':
1766 inBrackets = true;
1767 break;
1768 case ']':
1769 inBrackets = false;
1770 break;
1771 case '\\':
1772 lastWasEscape = true;
1773 break;
1774 }
1775 }
1776
1777 pattern = makeRightSizedIdentifier(m_buffer16.data(), m_buffer16.size(), charactersOredTogether);
1778
1779 m_buffer16.resize(0);
1780 charactersOredTogether = 0;
1781
1782 while (isIdentPart(m_current)) {
1783 record16(m_current);
1784 orCharacter<T>(charactersOredTogether, m_current);
1785 shift();
1786 }
1787
1788 flags = makeRightSizedIdentifier(m_buffer16.data(), m_buffer16.size(), charactersOredTogether);
1789 m_buffer16.resize(0);
1790
1791 return true;
1792 }
1793
1794 template <typename T>
1795 bool Lexer<T>::skipRegExp()
1796 {
1797 bool lastWasEscape = false;
1798 bool inBrackets = false;
1799
1800 while (true) {
1801 if (isLineTerminator(m_current) || atEnd())
1802 return false;
1803
1804 T prev = m_current;
1805
1806 shift();
1807
1808 if (prev == '/' && !lastWasEscape && !inBrackets)
1809 break;
1810
1811 if (lastWasEscape) {
1812 lastWasEscape = false;
1813 continue;
1814 }
1815
1816 switch (prev) {
1817 case '[':
1818 inBrackets = true;
1819 break;
1820 case ']':
1821 inBrackets = false;
1822 break;
1823 case '\\':
1824 lastWasEscape = true;
1825 break;
1826 }
1827 }
1828
1829 while (isIdentPart(m_current))
1830 shift();
1831
1832 return true;
1833 }
1834
1835 template <typename T>
1836 void Lexer<T>::clear()
1837 {
1838 m_arena = 0;
1839
1840 Vector<LChar> newBuffer8;
1841 m_buffer8.swap(newBuffer8);
1842
1843 Vector<UChar> newBuffer16;
1844 m_buffer16.swap(newBuffer16);
1845
1846 m_isReparsing = false;
1847 }
1848
1849 template <typename T>
1850 SourceCode Lexer<T>::sourceCode(int openBrace, int closeBrace, int firstLine, unsigned startColumn)
1851 {
1852 ASSERT(m_source->provider()->source()[openBrace] == '{');
1853 ASSERT(m_source->provider()->source()[closeBrace] == '}');
1854 return SourceCode(m_source->provider(), openBrace, closeBrace + 1, firstLine, startColumn);
1855 }
1856
1857 // Instantiate the two flavors of Lexer we need instead of putting most of this file in Lexer.h
1858 template class Lexer<LChar>;
1859 template class Lexer<UChar>;
1860
1861 } // namespace JSC