]> git.saurik.com Git - apple/javascriptcore.git/blob - parser/Lexer.cpp
JavaScriptCore-7600.1.4.15.12.tar.gz
[apple/javascriptcore.git] / parser / Lexer.cpp
1 /*
2 * Copyright (C) 1999-2000 Harri Porten (porten@kde.org)
3 * Copyright (C) 2006, 2007, 2008, 2009, 2011, 2012, 2013 Apple Inc. All Rights Reserved.
4 * Copyright (C) 2007 Cameron Zwarich (cwzwarich@uwaterloo.ca)
5 * Copyright (C) 2010 Zoltan Herczeg (zherczeg@inf.u-szeged.hu)
6 * Copyright (C) 2012 Mathias Bynens (mathias@qiwi.be)
7 *
8 * This library is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Library General Public
10 * License as published by the Free Software Foundation; either
11 * version 2 of the License, or (at your option) any later version.
12 *
13 * This library is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 * Library General Public License for more details.
17 *
18 * You should have received a copy of the GNU Library General Public License
19 * along with this library; see the file COPYING.LIB. If not, write to
20 * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
21 * Boston, MA 02110-1301, USA.
22 *
23 */
24
25 #include "config.h"
26 #include "Lexer.h"
27
28 #include "JSFunctionInlines.h"
29
30 #include "BuiltinNames.h"
31 #include "JSGlobalObjectFunctions.h"
32 #include "Identifier.h"
33 #include "NodeInfo.h"
34 #include "Nodes.h"
35 #include "JSCInlines.h"
36 #include <wtf/dtoa.h>
37 #include <ctype.h>
38 #include <limits.h>
39 #include <string.h>
40 #include <wtf/Assertions.h>
41
42 #include "KeywordLookup.h"
43 #include "Lexer.lut.h"
44 #include "Parser.h"
45
46 namespace JSC {
47
48 Keywords::Keywords(VM& vm)
49 : m_vm(vm)
50 , m_keywordTable(JSC::mainTable)
51 {
52 }
53
// Classification of a Latin-1 character, used to drive the lexer's main switch.
//
// The numeric order matters: the first three enumerators are fixed and double as the
// "may appear inside an identifier" range (see isIdentStart and isIdentPart, which
// compare against CharacterIdentifierStart and CharacterNumber).
enum CharacterType {
    // Identifier-capable types. Keep these first and in this order.
    CharacterIdentifierStart = 0,
    CharacterZero,
    CharacterNumber,

    // Punctuation and structural characters.
    CharacterInvalid,
    CharacterLineTerminator,
    CharacterExclamationMark,
    CharacterOpenParen,
    CharacterCloseParen,
    CharacterOpenBracket,
    CharacterCloseBracket,
    CharacterComma,
    CharacterColon,
    CharacterQuestion,
    CharacterTilde,
    CharacterQuote,
    CharacterDot,
    CharacterSlash,
    CharacterBackSlash,
    CharacterSemicolon,
    CharacterOpenBrace,
    CharacterCloseBrace,

    // Operator characters.
    CharacterAdd,
    CharacterSub,
    CharacterMultiply,
    CharacterModulo,
    CharacterAnd,
    CharacterXor,
    CharacterOr,
    CharacterLess,
    CharacterGreater,
    CharacterEqual,

    // Other types.
    CharacterWhiteSpace,
    CharacterPrivateIdentifierStart
};
97
98 // 256 Latin-1 codes
99 static const unsigned short typesOfLatin1Characters[256] = {
100 /* 0 - Null */ CharacterInvalid,
101 /* 1 - Start of Heading */ CharacterInvalid,
102 /* 2 - Start of Text */ CharacterInvalid,
103 /* 3 - End of Text */ CharacterInvalid,
104 /* 4 - End of Transm. */ CharacterInvalid,
105 /* 5 - Enquiry */ CharacterInvalid,
106 /* 6 - Acknowledgment */ CharacterInvalid,
107 /* 7 - Bell */ CharacterInvalid,
108 /* 8 - Back Space */ CharacterInvalid,
109 /* 9 - Horizontal Tab */ CharacterWhiteSpace,
110 /* 10 - Line Feed */ CharacterLineTerminator,
111 /* 11 - Vertical Tab */ CharacterWhiteSpace,
112 /* 12 - Form Feed */ CharacterWhiteSpace,
113 /* 13 - Carriage Return */ CharacterLineTerminator,
114 /* 14 - Shift Out */ CharacterInvalid,
115 /* 15 - Shift In */ CharacterInvalid,
116 /* 16 - Data Line Escape */ CharacterInvalid,
117 /* 17 - Device Control 1 */ CharacterInvalid,
118 /* 18 - Device Control 2 */ CharacterInvalid,
119 /* 19 - Device Control 3 */ CharacterInvalid,
120 /* 20 - Device Control 4 */ CharacterInvalid,
121 /* 21 - Negative Ack. */ CharacterInvalid,
122 /* 22 - Synchronous Idle */ CharacterInvalid,
123 /* 23 - End of Transmit */ CharacterInvalid,
124 /* 24 - Cancel */ CharacterInvalid,
125 /* 25 - End of Medium */ CharacterInvalid,
126 /* 26 - Substitute */ CharacterInvalid,
127 /* 27 - Escape */ CharacterInvalid,
128 /* 28 - File Separator */ CharacterInvalid,
129 /* 29 - Group Separator */ CharacterInvalid,
130 /* 30 - Record Separator */ CharacterInvalid,
131 /* 31 - Unit Separator */ CharacterInvalid,
132 /* 32 - Space */ CharacterWhiteSpace,
133 /* 33 - ! */ CharacterExclamationMark,
134 /* 34 - " */ CharacterQuote,
135 /* 35 - # */ CharacterInvalid,
136 /* 36 - $ */ CharacterIdentifierStart,
137 /* 37 - % */ CharacterModulo,
138 /* 38 - & */ CharacterAnd,
139 /* 39 - ' */ CharacterQuote,
140 /* 40 - ( */ CharacterOpenParen,
141 /* 41 - ) */ CharacterCloseParen,
142 /* 42 - * */ CharacterMultiply,
143 /* 43 - + */ CharacterAdd,
144 /* 44 - , */ CharacterComma,
145 /* 45 - - */ CharacterSub,
146 /* 46 - . */ CharacterDot,
147 /* 47 - / */ CharacterSlash,
148 /* 48 - 0 */ CharacterZero,
149 /* 49 - 1 */ CharacterNumber,
150 /* 50 - 2 */ CharacterNumber,
151 /* 51 - 3 */ CharacterNumber,
152 /* 52 - 4 */ CharacterNumber,
153 /* 53 - 5 */ CharacterNumber,
154 /* 54 - 6 */ CharacterNumber,
155 /* 55 - 7 */ CharacterNumber,
156 /* 56 - 8 */ CharacterNumber,
157 /* 57 - 9 */ CharacterNumber,
158 /* 58 - : */ CharacterColon,
159 /* 59 - ; */ CharacterSemicolon,
160 /* 60 - < */ CharacterLess,
161 /* 61 - = */ CharacterEqual,
162 /* 62 - > */ CharacterGreater,
163 /* 63 - ? */ CharacterQuestion,
164 /* 64 - @ */ CharacterPrivateIdentifierStart,
165 /* 65 - A */ CharacterIdentifierStart,
166 /* 66 - B */ CharacterIdentifierStart,
167 /* 67 - C */ CharacterIdentifierStart,
168 /* 68 - D */ CharacterIdentifierStart,
169 /* 69 - E */ CharacterIdentifierStart,
170 /* 70 - F */ CharacterIdentifierStart,
171 /* 71 - G */ CharacterIdentifierStart,
172 /* 72 - H */ CharacterIdentifierStart,
173 /* 73 - I */ CharacterIdentifierStart,
174 /* 74 - J */ CharacterIdentifierStart,
175 /* 75 - K */ CharacterIdentifierStart,
176 /* 76 - L */ CharacterIdentifierStart,
177 /* 77 - M */ CharacterIdentifierStart,
178 /* 78 - N */ CharacterIdentifierStart,
179 /* 79 - O */ CharacterIdentifierStart,
180 /* 80 - P */ CharacterIdentifierStart,
181 /* 81 - Q */ CharacterIdentifierStart,
182 /* 82 - R */ CharacterIdentifierStart,
183 /* 83 - S */ CharacterIdentifierStart,
184 /* 84 - T */ CharacterIdentifierStart,
185 /* 85 - U */ CharacterIdentifierStart,
186 /* 86 - V */ CharacterIdentifierStart,
187 /* 87 - W */ CharacterIdentifierStart,
188 /* 88 - X */ CharacterIdentifierStart,
189 /* 89 - Y */ CharacterIdentifierStart,
190 /* 90 - Z */ CharacterIdentifierStart,
191 /* 91 - [ */ CharacterOpenBracket,
192 /* 92 - \ */ CharacterBackSlash,
193 /* 93 - ] */ CharacterCloseBracket,
194 /* 94 - ^ */ CharacterXor,
195 /* 95 - _ */ CharacterIdentifierStart,
196 /* 96 - ` */ CharacterInvalid,
197 /* 97 - a */ CharacterIdentifierStart,
198 /* 98 - b */ CharacterIdentifierStart,
199 /* 99 - c */ CharacterIdentifierStart,
200 /* 100 - d */ CharacterIdentifierStart,
201 /* 101 - e */ CharacterIdentifierStart,
202 /* 102 - f */ CharacterIdentifierStart,
203 /* 103 - g */ CharacterIdentifierStart,
204 /* 104 - h */ CharacterIdentifierStart,
205 /* 105 - i */ CharacterIdentifierStart,
206 /* 106 - j */ CharacterIdentifierStart,
207 /* 107 - k */ CharacterIdentifierStart,
208 /* 108 - l */ CharacterIdentifierStart,
209 /* 109 - m */ CharacterIdentifierStart,
210 /* 110 - n */ CharacterIdentifierStart,
211 /* 111 - o */ CharacterIdentifierStart,
212 /* 112 - p */ CharacterIdentifierStart,
213 /* 113 - q */ CharacterIdentifierStart,
214 /* 114 - r */ CharacterIdentifierStart,
215 /* 115 - s */ CharacterIdentifierStart,
216 /* 116 - t */ CharacterIdentifierStart,
217 /* 117 - u */ CharacterIdentifierStart,
218 /* 118 - v */ CharacterIdentifierStart,
219 /* 119 - w */ CharacterIdentifierStart,
220 /* 120 - x */ CharacterIdentifierStart,
221 /* 121 - y */ CharacterIdentifierStart,
222 /* 122 - z */ CharacterIdentifierStart,
223 /* 123 - { */ CharacterOpenBrace,
224 /* 124 - | */ CharacterOr,
225 /* 125 - } */ CharacterCloseBrace,
226 /* 126 - ~ */ CharacterTilde,
227 /* 127 - Delete */ CharacterInvalid,
228 /* 128 - Cc category */ CharacterInvalid,
229 /* 129 - Cc category */ CharacterInvalid,
230 /* 130 - Cc category */ CharacterInvalid,
231 /* 131 - Cc category */ CharacterInvalid,
232 /* 132 - Cc category */ CharacterInvalid,
233 /* 133 - Cc category */ CharacterInvalid,
234 /* 134 - Cc category */ CharacterInvalid,
235 /* 135 - Cc category */ CharacterInvalid,
236 /* 136 - Cc category */ CharacterInvalid,
237 /* 137 - Cc category */ CharacterInvalid,
238 /* 138 - Cc category */ CharacterInvalid,
239 /* 139 - Cc category */ CharacterInvalid,
240 /* 140 - Cc category */ CharacterInvalid,
241 /* 141 - Cc category */ CharacterInvalid,
242 /* 142 - Cc category */ CharacterInvalid,
243 /* 143 - Cc category */ CharacterInvalid,
244 /* 144 - Cc category */ CharacterInvalid,
245 /* 145 - Cc category */ CharacterInvalid,
246 /* 146 - Cc category */ CharacterInvalid,
247 /* 147 - Cc category */ CharacterInvalid,
248 /* 148 - Cc category */ CharacterInvalid,
249 /* 149 - Cc category */ CharacterInvalid,
250 /* 150 - Cc category */ CharacterInvalid,
251 /* 151 - Cc category */ CharacterInvalid,
252 /* 152 - Cc category */ CharacterInvalid,
253 /* 153 - Cc category */ CharacterInvalid,
254 /* 154 - Cc category */ CharacterInvalid,
255 /* 155 - Cc category */ CharacterInvalid,
256 /* 156 - Cc category */ CharacterInvalid,
257 /* 157 - Cc category */ CharacterInvalid,
258 /* 158 - Cc category */ CharacterInvalid,
259 /* 159 - Cc category */ CharacterInvalid,
260 /* 160 - Zs category (nbsp) */ CharacterWhiteSpace,
261 /* 161 - Po category */ CharacterInvalid,
262 /* 162 - Sc category */ CharacterInvalid,
263 /* 163 - Sc category */ CharacterInvalid,
264 /* 164 - Sc category */ CharacterInvalid,
265 /* 165 - Sc category */ CharacterInvalid,
266 /* 166 - So category */ CharacterInvalid,
267 /* 167 - So category */ CharacterInvalid,
268 /* 168 - Sk category */ CharacterInvalid,
269 /* 169 - So category */ CharacterInvalid,
270 /* 170 - Ll category */ CharacterIdentifierStart,
271 /* 171 - Pi category */ CharacterInvalid,
272 /* 172 - Sm category */ CharacterInvalid,
273 /* 173 - Cf category */ CharacterInvalid,
274 /* 174 - So category */ CharacterInvalid,
275 /* 175 - Sk category */ CharacterInvalid,
276 /* 176 - So category */ CharacterInvalid,
277 /* 177 - Sm category */ CharacterInvalid,
278 /* 178 - No category */ CharacterInvalid,
279 /* 179 - No category */ CharacterInvalid,
280 /* 180 - Sk category */ CharacterInvalid,
281 /* 181 - Ll category */ CharacterIdentifierStart,
282 /* 182 - So category */ CharacterInvalid,
283 /* 183 - Po category */ CharacterInvalid,
284 /* 184 - Sk category */ CharacterInvalid,
285 /* 185 - No category */ CharacterInvalid,
286 /* 186 - Ll category */ CharacterIdentifierStart,
287 /* 187 - Pf category */ CharacterInvalid,
288 /* 188 - No category */ CharacterInvalid,
289 /* 189 - No category */ CharacterInvalid,
290 /* 190 - No category */ CharacterInvalid,
291 /* 191 - Po category */ CharacterInvalid,
292 /* 192 - Lu category */ CharacterIdentifierStart,
293 /* 193 - Lu category */ CharacterIdentifierStart,
294 /* 194 - Lu category */ CharacterIdentifierStart,
295 /* 195 - Lu category */ CharacterIdentifierStart,
296 /* 196 - Lu category */ CharacterIdentifierStart,
297 /* 197 - Lu category */ CharacterIdentifierStart,
298 /* 198 - Lu category */ CharacterIdentifierStart,
299 /* 199 - Lu category */ CharacterIdentifierStart,
300 /* 200 - Lu category */ CharacterIdentifierStart,
301 /* 201 - Lu category */ CharacterIdentifierStart,
302 /* 202 - Lu category */ CharacterIdentifierStart,
303 /* 203 - Lu category */ CharacterIdentifierStart,
304 /* 204 - Lu category */ CharacterIdentifierStart,
305 /* 205 - Lu category */ CharacterIdentifierStart,
306 /* 206 - Lu category */ CharacterIdentifierStart,
307 /* 207 - Lu category */ CharacterIdentifierStart,
308 /* 208 - Lu category */ CharacterIdentifierStart,
309 /* 209 - Lu category */ CharacterIdentifierStart,
310 /* 210 - Lu category */ CharacterIdentifierStart,
311 /* 211 - Lu category */ CharacterIdentifierStart,
312 /* 212 - Lu category */ CharacterIdentifierStart,
313 /* 213 - Lu category */ CharacterIdentifierStart,
314 /* 214 - Lu category */ CharacterIdentifierStart,
315 /* 215 - Sm category */ CharacterInvalid,
316 /* 216 - Lu category */ CharacterIdentifierStart,
317 /* 217 - Lu category */ CharacterIdentifierStart,
318 /* 218 - Lu category */ CharacterIdentifierStart,
319 /* 219 - Lu category */ CharacterIdentifierStart,
320 /* 220 - Lu category */ CharacterIdentifierStart,
321 /* 221 - Lu category */ CharacterIdentifierStart,
322 /* 222 - Lu category */ CharacterIdentifierStart,
323 /* 223 - Ll category */ CharacterIdentifierStart,
324 /* 224 - Ll category */ CharacterIdentifierStart,
325 /* 225 - Ll category */ CharacterIdentifierStart,
326 /* 226 - Ll category */ CharacterIdentifierStart,
327 /* 227 - Ll category */ CharacterIdentifierStart,
328 /* 228 - Ll category */ CharacterIdentifierStart,
329 /* 229 - Ll category */ CharacterIdentifierStart,
330 /* 230 - Ll category */ CharacterIdentifierStart,
331 /* 231 - Ll category */ CharacterIdentifierStart,
332 /* 232 - Ll category */ CharacterIdentifierStart,
333 /* 233 - Ll category */ CharacterIdentifierStart,
334 /* 234 - Ll category */ CharacterIdentifierStart,
335 /* 235 - Ll category */ CharacterIdentifierStart,
336 /* 236 - Ll category */ CharacterIdentifierStart,
337 /* 237 - Ll category */ CharacterIdentifierStart,
338 /* 238 - Ll category */ CharacterIdentifierStart,
339 /* 239 - Ll category */ CharacterIdentifierStart,
340 /* 240 - Ll category */ CharacterIdentifierStart,
341 /* 241 - Ll category */ CharacterIdentifierStart,
342 /* 242 - Ll category */ CharacterIdentifierStart,
343 /* 243 - Ll category */ CharacterIdentifierStart,
344 /* 244 - Ll category */ CharacterIdentifierStart,
345 /* 245 - Ll category */ CharacterIdentifierStart,
346 /* 246 - Ll category */ CharacterIdentifierStart,
347 /* 247 - Sm category */ CharacterInvalid,
348 /* 248 - Ll category */ CharacterIdentifierStart,
349 /* 249 - Ll category */ CharacterIdentifierStart,
350 /* 250 - Ll category */ CharacterIdentifierStart,
351 /* 251 - Ll category */ CharacterIdentifierStart,
352 /* 252 - Ll category */ CharacterIdentifierStart,
353 /* 253 - Ll category */ CharacterIdentifierStart,
354 /* 254 - Ll category */ CharacterIdentifierStart,
355 /* 255 - Ll category */ CharacterIdentifierStart
356 };
357
358 // This table provides the character that results from \X where X is the index in the table beginning
359 // with SPACE. A table value of 0 means that more processing needs to be done.
360 static const LChar singleCharacterEscapeValuesForASCII[128] = {
361 /* 0 - Null */ 0,
362 /* 1 - Start of Heading */ 0,
363 /* 2 - Start of Text */ 0,
364 /* 3 - End of Text */ 0,
365 /* 4 - End of Transm. */ 0,
366 /* 5 - Enquiry */ 0,
367 /* 6 - Acknowledgment */ 0,
368 /* 7 - Bell */ 0,
369 /* 8 - Back Space */ 0,
370 /* 9 - Horizontal Tab */ 0,
371 /* 10 - Line Feed */ 0,
372 /* 11 - Vertical Tab */ 0,
373 /* 12 - Form Feed */ 0,
374 /* 13 - Carriage Return */ 0,
375 /* 14 - Shift Out */ 0,
376 /* 15 - Shift In */ 0,
377 /* 16 - Data Line Escape */ 0,
378 /* 17 - Device Control 1 */ 0,
379 /* 18 - Device Control 2 */ 0,
380 /* 19 - Device Control 3 */ 0,
381 /* 20 - Device Control 4 */ 0,
382 /* 21 - Negative Ack. */ 0,
383 /* 22 - Synchronous Idle */ 0,
384 /* 23 - End of Transmit */ 0,
385 /* 24 - Cancel */ 0,
386 /* 25 - End of Medium */ 0,
387 /* 26 - Substitute */ 0,
388 /* 27 - Escape */ 0,
389 /* 28 - File Separator */ 0,
390 /* 29 - Group Separator */ 0,
391 /* 30 - Record Separator */ 0,
392 /* 31 - Unit Separator */ 0,
393 /* 32 - Space */ ' ',
394 /* 33 - ! */ '!',
395 /* 34 - " */ '"',
396 /* 35 - # */ '#',
397 /* 36 - $ */ '$',
398 /* 37 - % */ '%',
399 /* 38 - & */ '&',
400 /* 39 - ' */ '\'',
401 /* 40 - ( */ '(',
402 /* 41 - ) */ ')',
403 /* 42 - * */ '*',
404 /* 43 - + */ '+',
405 /* 44 - , */ ',',
406 /* 45 - - */ '-',
407 /* 46 - . */ '.',
408 /* 47 - / */ '/',
409 /* 48 - 0 */ 0,
410 /* 49 - 1 */ 0,
411 /* 50 - 2 */ 0,
412 /* 51 - 3 */ 0,
413 /* 52 - 4 */ 0,
414 /* 53 - 5 */ 0,
415 /* 54 - 6 */ 0,
416 /* 55 - 7 */ 0,
417 /* 56 - 8 */ 0,
418 /* 57 - 9 */ 0,
419 /* 58 - : */ ':',
420 /* 59 - ; */ ';',
421 /* 60 - < */ '<',
422 /* 61 - = */ '=',
423 /* 62 - > */ '>',
424 /* 63 - ? */ '?',
425 /* 64 - @ */ '@',
426 /* 65 - A */ 'A',
427 /* 66 - B */ 'B',
428 /* 67 - C */ 'C',
429 /* 68 - D */ 'D',
430 /* 69 - E */ 'E',
431 /* 70 - F */ 'F',
432 /* 71 - G */ 'G',
433 /* 72 - H */ 'H',
434 /* 73 - I */ 'I',
435 /* 74 - J */ 'J',
436 /* 75 - K */ 'K',
437 /* 76 - L */ 'L',
438 /* 77 - M */ 'M',
439 /* 78 - N */ 'N',
440 /* 79 - O */ 'O',
441 /* 80 - P */ 'P',
442 /* 81 - Q */ 'Q',
443 /* 82 - R */ 'R',
444 /* 83 - S */ 'S',
445 /* 84 - T */ 'T',
446 /* 85 - U */ 'U',
447 /* 86 - V */ 'V',
448 /* 87 - W */ 'W',
449 /* 88 - X */ 'X',
450 /* 89 - Y */ 'Y',
451 /* 90 - Z */ 'Z',
452 /* 91 - [ */ '[',
453 /* 92 - \ */ '\\',
454 /* 93 - ] */ ']',
455 /* 94 - ^ */ '^',
456 /* 95 - _ */ '_',
457 /* 96 - ` */ '`',
458 /* 97 - a */ 'a',
459 /* 98 - b */ 0x08,
460 /* 99 - c */ 'c',
461 /* 100 - d */ 'd',
462 /* 101 - e */ 'e',
463 /* 102 - f */ 0x0C,
464 /* 103 - g */ 'g',
465 /* 104 - h */ 'h',
466 /* 105 - i */ 'i',
467 /* 106 - j */ 'j',
468 /* 107 - k */ 'k',
469 /* 108 - l */ 'l',
470 /* 109 - m */ 'm',
471 /* 110 - n */ 0x0A,
472 /* 111 - o */ 'o',
473 /* 112 - p */ 'p',
474 /* 113 - q */ 'q',
475 /* 114 - r */ 0x0D,
476 /* 115 - s */ 's',
477 /* 116 - t */ 0x09,
478 /* 117 - u */ 0,
479 /* 118 - v */ 0x0B,
480 /* 119 - w */ 'w',
481 /* 120 - x */ 0,
482 /* 121 - y */ 'y',
483 /* 122 - z */ 'z',
484 /* 123 - { */ '{',
485 /* 124 - | */ '|',
486 /* 125 - } */ '}',
487 /* 126 - ~ */ '~',
488 /* 127 - Delete */ 0
489 };
490
491 template <typename T>
492 Lexer<T>::Lexer(VM* vm, JSParserStrictness strictness)
493 : m_isReparsing(false)
494 , m_vm(vm)
495 , m_parsingBuiltinFunction(strictness == JSParseBuiltin)
496 {
497 }
498
499 template <typename T>
500 Lexer<T>::~Lexer()
501 {
502 }
503
504 template <typename T>
505 String Lexer<T>::invalidCharacterMessage() const
506 {
507 switch (m_current) {
508 case 0:
509 return "Invalid character: '\\0'";
510 case 10:
511 return "Invalid character: '\\n'";
512 case 11:
513 return "Invalid character: '\\v'";
514 case 13:
515 return "Invalid character: '\\r'";
516 case 35:
517 return "Invalid character: '#'";
518 case 64:
519 return "Invalid character: '@'";
520 case 96:
521 return "Invalid character: '`'";
522 default:
523 return String::format("Invalid character '\\u%04u'", static_cast<unsigned>(m_current)).impl();
524 }
525 }
526
527 template <typename T>
528 ALWAYS_INLINE const T* Lexer<T>::currentSourcePtr() const
529 {
530 ASSERT(m_code <= m_codeEnd);
531 return m_code;
532 }
533
534 template <typename T>
535 void Lexer<T>::setCode(const SourceCode& source, ParserArena* arena)
536 {
537 m_arena = &arena->identifierArena();
538
539 m_lineNumber = source.firstLine();
540 m_lastToken = -1;
541
542 const String& sourceString = source.provider()->source();
543
544 if (!sourceString.isNull())
545 setCodeStart(sourceString.impl());
546 else
547 m_codeStart = 0;
548
549 m_source = &source;
550 m_sourceOffset = source.startOffset();
551 m_codeStartPlusOffset = m_codeStart + source.startOffset();
552 m_code = m_codeStartPlusOffset;
553 m_codeEnd = m_codeStart + source.endOffset();
554 m_error = false;
555 m_atLineStart = true;
556 m_lineStart = m_code;
557 m_lexErrorMessage = String();
558
559 m_buffer8.reserveInitialCapacity(initialReadBufferCapacity);
560 m_buffer16.reserveInitialCapacity((m_codeEnd - m_code) / 2);
561
562 if (LIKELY(m_code < m_codeEnd))
563 m_current = *m_code;
564 else
565 m_current = 0;
566 ASSERT(currentOffset() == source.startOffset());
567 }
568
569 template <typename T>
570 template <int shiftAmount> ALWAYS_INLINE void Lexer<T>::internalShift()
571 {
572 m_code += shiftAmount;
573 ASSERT(currentOffset() >= currentLineStartOffset());
574 m_current = *m_code;
575 }
576
577 template <typename T>
578 ALWAYS_INLINE void Lexer<T>::shift()
579 {
580 // At one point timing showed that setting m_current to 0 unconditionally was faster than an if-else sequence.
581 m_current = 0;
582 ++m_code;
583 if (LIKELY(m_code < m_codeEnd))
584 m_current = *m_code;
585 }
586
587 template <typename T>
588 ALWAYS_INLINE bool Lexer<T>::atEnd() const
589 {
590 ASSERT(!m_current || m_code < m_codeEnd);
591 return UNLIKELY(UNLIKELY(!m_current) && m_code == m_codeEnd);
592 }
593
594 template <typename T>
595 ALWAYS_INLINE T Lexer<T>::peek(int offset) const
596 {
597 ASSERT(offset > 0 && offset < 5);
598 const T* code = m_code + offset;
599 return (code < m_codeEnd) ? *code : 0;
600 }
601
602 template <typename T>
603 typename Lexer<T>::UnicodeHexValue Lexer<T>::parseFourDigitUnicodeHex()
604 {
605 T char1 = peek(1);
606 T char2 = peek(2);
607 T char3 = peek(3);
608
609 if (UNLIKELY(!isASCIIHexDigit(m_current) || !isASCIIHexDigit(char1) || !isASCIIHexDigit(char2) || !isASCIIHexDigit(char3)))
610 return UnicodeHexValue((m_code + 4) >= m_codeEnd ? UnicodeHexValue::IncompleteHex : UnicodeHexValue::InvalidHex);
611
612 int result = convertUnicode(m_current, char1, char2, char3);
613 shift();
614 shift();
615 shift();
616 shift();
617 return UnicodeHexValue(result);
618 }
619
620 template <typename T>
621 void Lexer<T>::shiftLineTerminator()
622 {
623 ASSERT(isLineTerminator(m_current));
624
625 m_positionBeforeLastNewline = currentPosition();
626 T prev = m_current;
627 shift();
628
629 // Allow both CRLF and LFCR.
630 if (prev + m_current == '\n' + '\r')
631 shift();
632
633 ++m_lineNumber;
634 }
635
636 template <typename T>
637 ALWAYS_INLINE bool Lexer<T>::lastTokenWasRestrKeyword() const
638 {
639 return m_lastToken == CONTINUE || m_lastToken == BREAK || m_lastToken == RETURN || m_lastToken == THROW;
640 }
641
642 static NEVER_INLINE bool isNonLatin1IdentStart(UChar c)
643 {
644 return U_GET_GC_MASK(c) & U_GC_L_MASK;
645 }
646
647 static ALWAYS_INLINE bool isLatin1(LChar)
648 {
649 return true;
650 }
651
652 static ALWAYS_INLINE bool isLatin1(UChar c)
653 {
654 return c < 256;
655 }
656
657 static inline bool isIdentStart(LChar c)
658 {
659 return typesOfLatin1Characters[c] == CharacterIdentifierStart;
660 }
661
662 static inline bool isIdentStart(UChar c)
663 {
664 return isLatin1(c) ? isIdentStart(static_cast<LChar>(c)) : isNonLatin1IdentStart(c);
665 }
666
667 static NEVER_INLINE bool isNonLatin1IdentPart(int c)
668 {
669 return (U_GET_GC_MASK(c) & (U_GC_L_MASK | U_GC_MN_MASK | U_GC_MC_MASK | U_GC_ND_MASK | U_GC_PC_MASK)) || c == 0x200C || c == 0x200D;
670 }
671
672 static ALWAYS_INLINE bool isIdentPart(LChar c)
673 {
674 // Character types are divided into two groups depending on whether they can be part of an
675 // identifier or not. Those whose type value is less or equal than CharacterNumber can be
676 // part of an identifier. (See the CharacterType definition for more details.)
677 return typesOfLatin1Characters[c] <= CharacterNumber;
678 }
679
680 static ALWAYS_INLINE bool isIdentPart(UChar c)
681 {
682 return isLatin1(c) ? isIdentPart(static_cast<LChar>(c)) : isNonLatin1IdentPart(c);
683 }
684
685 static inline LChar singleEscape(int c)
686 {
687 if (c < 128) {
688 ASSERT(static_cast<size_t>(c) < ARRAY_SIZE(singleCharacterEscapeValuesForASCII));
689 return singleCharacterEscapeValuesForASCII[c];
690 }
691 return 0;
692 }
693
694 template <typename T>
695 inline void Lexer<T>::record8(int c)
696 {
697 ASSERT(c >= 0);
698 ASSERT(c <= 0xFF);
699 m_buffer8.append(static_cast<LChar>(c));
700 }
701
// Debug check that |c| fits in eight bits. Generic version for types that may be signed.
template <typename T>
inline void assertCharIsIn8BitRange(T c)
{
    UNUSED_PARAM(c);
    ASSERT(c >= 0 && c <= 0xFF);
}
709
710 template <>
711 inline void assertCharIsIn8BitRange(UChar c)
712 {
713 UNUSED_PARAM(c);
714 ASSERT(c <= 0xFF);
715 }
716
717 template <>
718 inline void assertCharIsIn8BitRange(LChar)
719 {
720 }
721
722 template <typename T>
723 inline void Lexer<T>::append8(const T* p, size_t length)
724 {
725 size_t currentSize = m_buffer8.size();
726 m_buffer8.grow(currentSize + length);
727 LChar* rawBuffer = m_buffer8.data() + currentSize;
728
729 for (size_t i = 0; i < length; i++) {
730 T c = p[i];
731 assertCharIsIn8BitRange(c);
732 rawBuffer[i] = c;
733 }
734 }
735
736 template <typename T>
737 inline void Lexer<T>::append16(const LChar* p, size_t length)
738 {
739 size_t currentSize = m_buffer16.size();
740 m_buffer16.grow(currentSize + length);
741 UChar* rawBuffer = m_buffer16.data() + currentSize;
742
743 for (size_t i = 0; i < length; i++)
744 rawBuffer[i] = p[i];
745 }
746
747 template <typename T>
748 inline void Lexer<T>::record16(T c)
749 {
750 m_buffer16.append(c);
751 }
752
753 template <typename T>
754 inline void Lexer<T>::record16(int c)
755 {
756 ASSERT(c >= 0);
757 ASSERT(c <= static_cast<int>(USHRT_MAX));
758 m_buffer16.append(static_cast<UChar>(c));
759 }
760
761 #if !ASSERT_DISABLED
762 bool isSafeBuiltinIdentifier(VM& vm, const Identifier* ident)
763 {
764 if (!ident)
765 return true;
766 /* Just block any use of suspicious identifiers. This is intended to
767 * be used as a safety net while implementing builtins.
768 */
769 if (*ident == vm.propertyNames->builtinNames().callPublicName())
770 return false;
771 if (*ident == vm.propertyNames->builtinNames().applyPublicName())
772 return false;
773 if (*ident == vm.propertyNames->eval)
774 return false;
775 if (*ident == vm.propertyNames->Function)
776 return false;
777 return true;
778 }
779 #endif
780
781 template <>
782 template <bool shouldCreateIdentifier> ALWAYS_INLINE JSTokenType Lexer<LChar>::parseIdentifier(JSTokenData* tokenData, unsigned lexerFlags, bool strictMode)
783 {
784 const ptrdiff_t remaining = m_codeEnd - m_code;
785 if ((remaining >= maxTokenLength) && !(lexerFlags & LexerFlagsIgnoreReservedWords)) {
786 JSTokenType keyword = parseKeyword<shouldCreateIdentifier>(tokenData);
787 if (keyword != IDENT) {
788 ASSERT((!shouldCreateIdentifier) || tokenData->ident);
789 return keyword == RESERVED_IF_STRICT && !strictMode ? IDENT : keyword;
790 }
791 }
792
793 bool isPrivateName = m_current == '@' && m_parsingBuiltinFunction;
794 if (isPrivateName)
795 shift();
796
797 const LChar* identifierStart = currentSourcePtr();
798 unsigned identifierLineStart = currentLineStartOffset();
799
800 while (isIdentPart(m_current))
801 shift();
802
803 if (UNLIKELY(m_current == '\\')) {
804 setOffsetFromSourcePtr(identifierStart, identifierLineStart);
805 return parseIdentifierSlowCase<shouldCreateIdentifier>(tokenData, lexerFlags, strictMode);
806 }
807
808 const Identifier* ident = 0;
809
810 if (shouldCreateIdentifier || m_parsingBuiltinFunction) {
811 int identifierLength = currentSourcePtr() - identifierStart;
812 ident = makeIdentifier(identifierStart, identifierLength);
813 if (m_parsingBuiltinFunction) {
814 if (!isSafeBuiltinIdentifier(*m_vm, ident) && !isPrivateName) {
815 m_lexErrorMessage = makeString("The use of '", ident->string(), "' is disallowed in builtin functions.");
816 return ERRORTOK;
817 }
818 if (isPrivateName)
819 ident = m_vm->propertyNames->getPrivateName(*ident);
820 else if (*ident == m_vm->propertyNames->undefinedKeyword)
821 tokenData->ident = &m_vm->propertyNames->undefinedPrivateName;
822 if (!ident)
823 return INVALID_PRIVATE_NAME_ERRORTOK;
824 }
825 tokenData->ident = ident;
826 } else
827 tokenData->ident = 0;
828
829 if (UNLIKELY((remaining < maxTokenLength) && !(lexerFlags & LexerFlagsIgnoreReservedWords)) && !isPrivateName) {
830 ASSERT(shouldCreateIdentifier);
831 if (remaining < maxTokenLength) {
832 const HashTableValue* entry = m_vm->keywords->getKeyword(*ident);
833 ASSERT((remaining < maxTokenLength) || !entry);
834 if (!entry)
835 return IDENT;
836 JSTokenType token = static_cast<JSTokenType>(entry->lexerValue());
837 return (token != RESERVED_IF_STRICT) || strictMode ? token : IDENT;
838 }
839 return IDENT;
840 }
841
842 return IDENT;
843 }
844
845 template <>
846 template <bool shouldCreateIdentifier> ALWAYS_INLINE JSTokenType Lexer<UChar>::parseIdentifier(JSTokenData* tokenData, unsigned lexerFlags, bool strictMode)
847 {
848 const ptrdiff_t remaining = m_codeEnd - m_code;
849 if ((remaining >= maxTokenLength) && !(lexerFlags & LexerFlagsIgnoreReservedWords)) {
850 JSTokenType keyword = parseKeyword<shouldCreateIdentifier>(tokenData);
851 if (keyword != IDENT) {
852 ASSERT((!shouldCreateIdentifier) || tokenData->ident);
853 return keyword == RESERVED_IF_STRICT && !strictMode ? IDENT : keyword;
854 }
855 }
856
857 bool isPrivateName = m_current == '@' && m_parsingBuiltinFunction;
858 if (isPrivateName)
859 shift();
860
861 const UChar* identifierStart = currentSourcePtr();
862 int identifierLineStart = currentLineStartOffset();
863
864 UChar orAllChars = 0;
865
866 while (isIdentPart(m_current)) {
867 orAllChars |= m_current;
868 shift();
869 }
870
871 if (UNLIKELY(m_current == '\\')) {
872 ASSERT(!isPrivateName);
873 setOffsetFromSourcePtr(identifierStart, identifierLineStart);
874 return parseIdentifierSlowCase<shouldCreateIdentifier>(tokenData, lexerFlags, strictMode);
875 }
876
877 bool isAll8Bit = false;
878
879 if (!(orAllChars & ~0xff))
880 isAll8Bit = true;
881
882 const Identifier* ident = 0;
883
884 if (shouldCreateIdentifier || m_parsingBuiltinFunction) {
885 int identifierLength = currentSourcePtr() - identifierStart;
886 if (isAll8Bit)
887 ident = makeIdentifierLCharFromUChar(identifierStart, identifierLength);
888 else
889 ident = makeIdentifier(identifierStart, identifierLength);
890 if (m_parsingBuiltinFunction) {
891 if (!isSafeBuiltinIdentifier(*m_vm, ident) && !isPrivateName) {
892 m_lexErrorMessage = makeString("The use of '", ident->string(), "' is disallowed in builtin functions.");
893 return ERRORTOK;
894 }
895 if (isPrivateName)
896 ident = m_vm->propertyNames->getPrivateName(*ident);
897 else if (*ident == m_vm->propertyNames->undefinedKeyword)
898 tokenData->ident = &m_vm->propertyNames->undefinedPrivateName;
899 if (!ident)
900 return INVALID_PRIVATE_NAME_ERRORTOK;
901 }
902 tokenData->ident = ident;
903 } else
904 tokenData->ident = 0;
905
906 if (UNLIKELY((remaining < maxTokenLength) && !(lexerFlags & LexerFlagsIgnoreReservedWords)) && !isPrivateName) {
907 ASSERT(shouldCreateIdentifier);
908 if (remaining < maxTokenLength) {
909 const HashTableValue* entry = m_vm->keywords->getKeyword(*ident);
910 ASSERT((remaining < maxTokenLength) || !entry);
911 if (!entry)
912 return IDENT;
913 JSTokenType token = static_cast<JSTokenType>(entry->lexerValue());
914 return (token != RESERVED_IF_STRICT) || strictMode ? token : IDENT;
915 }
916 return IDENT;
917 }
918
919 return IDENT;
920 }
921
922 template <typename T>
923 template <bool shouldCreateIdentifier> JSTokenType Lexer<T>::parseIdentifierSlowCase(JSTokenData* tokenData, unsigned lexerFlags, bool strictMode)
924 {
925 const ptrdiff_t remaining = m_codeEnd - m_code;
926 const T* identifierStart = currentSourcePtr();
927 bool bufferRequired = false;
928
929 while (true) {
930 if (LIKELY(isIdentPart(m_current))) {
931 shift();
932 continue;
933 }
934 if (LIKELY(m_current != '\\'))
935 break;
936
937 // \uXXXX unicode characters.
938 bufferRequired = true;
939 if (identifierStart != currentSourcePtr())
940 m_buffer16.append(identifierStart, currentSourcePtr() - identifierStart);
941 shift();
942 if (UNLIKELY(m_current != 'u'))
943 return atEnd() ? UNTERMINATED_IDENTIFIER_ESCAPE_ERRORTOK : INVALID_IDENTIFIER_ESCAPE_ERRORTOK;
944 shift();
945 UnicodeHexValue character = parseFourDigitUnicodeHex();
946 if (UNLIKELY(!character.isValid()))
947 return character.valueType() == UnicodeHexValue::IncompleteHex ? UNTERMINATED_IDENTIFIER_UNICODE_ESCAPE_ERRORTOK : INVALID_IDENTIFIER_UNICODE_ESCAPE_ERRORTOK;
948 UChar ucharacter = static_cast<UChar>(character.value());
949 if (UNLIKELY(m_buffer16.size() ? !isIdentPart(ucharacter) : !isIdentStart(ucharacter)))
950 return INVALID_IDENTIFIER_UNICODE_ESCAPE_ERRORTOK;
951 if (shouldCreateIdentifier)
952 record16(ucharacter);
953 identifierStart = currentSourcePtr();
954 }
955
956 int identifierLength;
957 const Identifier* ident = 0;
958 if (shouldCreateIdentifier) {
959 if (!bufferRequired) {
960 identifierLength = currentSourcePtr() - identifierStart;
961 ident = makeIdentifier(identifierStart, identifierLength);
962 } else {
963 if (identifierStart != currentSourcePtr())
964 m_buffer16.append(identifierStart, currentSourcePtr() - identifierStart);
965 ident = makeIdentifier(m_buffer16.data(), m_buffer16.size());
966 }
967
968 tokenData->ident = ident;
969 } else
970 tokenData->ident = 0;
971
972 if (LIKELY(!bufferRequired && !(lexerFlags & LexerFlagsIgnoreReservedWords))) {
973 ASSERT(shouldCreateIdentifier);
974 // Keywords must not be recognized if there was an \uXXXX in the identifier.
975 if (remaining < maxTokenLength) {
976 const HashTableValue* entry = m_vm->keywords->getKeyword(*ident);
977 ASSERT((remaining < maxTokenLength) || !entry);
978 if (!entry)
979 return IDENT;
980 JSTokenType token = static_cast<JSTokenType>(entry->lexerValue());
981 return (token != RESERVED_IF_STRICT) || strictMode ? token : IDENT;
982 }
983 return IDENT;
984 }
985
986 m_buffer16.resize(0);
987 return IDENT;
988 }
989
990 static ALWAYS_INLINE bool characterRequiresParseStringSlowCase(LChar character)
991 {
992 return character < 0xE;
993 }
994
995 static ALWAYS_INLINE bool characterRequiresParseStringSlowCase(UChar character)
996 {
997 return character < 0xE || character > 0xFF;
998 }
999
1000 template <typename T>
1001 template <bool shouldBuildStrings> ALWAYS_INLINE typename Lexer<T>::StringParseResult Lexer<T>::parseString(JSTokenData* tokenData, bool strictMode)
1002 {
1003 int startingOffset = currentOffset();
1004 int startingLineStartOffset = currentLineStartOffset();
1005 int startingLineNumber = lineNumber();
1006 T stringQuoteCharacter = m_current;
1007 shift();
1008
1009 const T* stringStart = currentSourcePtr();
1010
1011 while (m_current != stringQuoteCharacter) {
1012 if (UNLIKELY(m_current == '\\')) {
1013 if (stringStart != currentSourcePtr() && shouldBuildStrings)
1014 append8(stringStart, currentSourcePtr() - stringStart);
1015 shift();
1016
1017 LChar escape = singleEscape(m_current);
1018
1019 // Most common escape sequences first
1020 if (escape) {
1021 if (shouldBuildStrings)
1022 record8(escape);
1023 shift();
1024 } else if (UNLIKELY(isLineTerminator(m_current)))
1025 shiftLineTerminator();
1026 else if (m_current == 'x') {
1027 shift();
1028 if (!isASCIIHexDigit(m_current) || !isASCIIHexDigit(peek(1))) {
1029 m_lexErrorMessage = "\\x can only be followed by a hex character sequence";
1030 return (atEnd() || (isASCIIHexDigit(m_current) && (m_code + 1 == m_codeEnd))) ? StringUnterminated : StringCannotBeParsed;
1031 }
1032 T prev = m_current;
1033 shift();
1034 if (shouldBuildStrings)
1035 record8(convertHex(prev, m_current));
1036 shift();
1037 } else {
1038 setOffset(startingOffset, startingLineStartOffset);
1039 setLineNumber(startingLineNumber);
1040 m_buffer8.resize(0);
1041 return parseStringSlowCase<shouldBuildStrings>(tokenData, strictMode);
1042 }
1043 stringStart = currentSourcePtr();
1044 continue;
1045 }
1046
1047 if (UNLIKELY(characterRequiresParseStringSlowCase(m_current))) {
1048 setOffset(startingOffset, startingLineStartOffset);
1049 setLineNumber(startingLineNumber);
1050 m_buffer8.resize(0);
1051 return parseStringSlowCase<shouldBuildStrings>(tokenData, strictMode);
1052 }
1053
1054 shift();
1055 }
1056
1057 if (currentSourcePtr() != stringStart && shouldBuildStrings)
1058 append8(stringStart, currentSourcePtr() - stringStart);
1059 if (shouldBuildStrings) {
1060 tokenData->ident = makeIdentifier(m_buffer8.data(), m_buffer8.size());
1061 m_buffer8.resize(0);
1062 } else
1063 tokenData->ident = 0;
1064
1065 return StringParsedSuccessfully;
1066 }
1067
1068 template <typename T>
1069 template <bool shouldBuildStrings> typename Lexer<T>::StringParseResult Lexer<T>::parseStringSlowCase(JSTokenData* tokenData, bool strictMode)
1070 {
1071 T stringQuoteCharacter = m_current;
1072 shift();
1073
1074 const T* stringStart = currentSourcePtr();
1075
1076 while (m_current != stringQuoteCharacter) {
1077 if (UNLIKELY(m_current == '\\')) {
1078 if (stringStart != currentSourcePtr() && shouldBuildStrings)
1079 append16(stringStart, currentSourcePtr() - stringStart);
1080 shift();
1081
1082 LChar escape = singleEscape(m_current);
1083
1084 // Most common escape sequences first
1085 if (escape) {
1086 if (shouldBuildStrings)
1087 record16(escape);
1088 shift();
1089 } else if (UNLIKELY(isLineTerminator(m_current)))
1090 shiftLineTerminator();
1091 else if (m_current == 'x') {
1092 shift();
1093 if (!isASCIIHexDigit(m_current) || !isASCIIHexDigit(peek(1))) {
1094 m_lexErrorMessage = "\\x can only be followed by a hex character sequence";
1095 return StringCannotBeParsed;
1096 }
1097 T prev = m_current;
1098 shift();
1099 if (shouldBuildStrings)
1100 record16(convertHex(prev, m_current));
1101 shift();
1102 } else if (m_current == 'u') {
1103 shift();
1104 UnicodeHexValue character = parseFourDigitUnicodeHex();
1105 if (character.isValid()) {
1106 if (shouldBuildStrings)
1107 record16(character.value());
1108 } else if (m_current == stringQuoteCharacter) {
1109 if (shouldBuildStrings)
1110 record16('u');
1111 } else {
1112 m_lexErrorMessage = "\\u can only be followed by a Unicode character sequence";
1113 return character.valueType() == UnicodeHexValue::IncompleteHex ? StringUnterminated : StringCannotBeParsed;
1114 }
1115 } else if (strictMode && isASCIIDigit(m_current)) {
1116 // The only valid numeric escape in strict mode is '\0', and this must not be followed by a decimal digit.
1117 int character1 = m_current;
1118 shift();
1119 if (character1 != '0' || isASCIIDigit(m_current)) {
1120 m_lexErrorMessage = "The only valid numeric escape in strict mode is '\\0'";
1121 return StringCannotBeParsed;
1122 }
1123 if (shouldBuildStrings)
1124 record16(0);
1125 } else if (!strictMode && isASCIIOctalDigit(m_current)) {
1126 // Octal character sequences
1127 T character1 = m_current;
1128 shift();
1129 if (isASCIIOctalDigit(m_current)) {
1130 // Two octal characters
1131 T character2 = m_current;
1132 shift();
1133 if (character1 >= '0' && character1 <= '3' && isASCIIOctalDigit(m_current)) {
1134 if (shouldBuildStrings)
1135 record16((character1 - '0') * 64 + (character2 - '0') * 8 + m_current - '0');
1136 shift();
1137 } else {
1138 if (shouldBuildStrings)
1139 record16((character1 - '0') * 8 + character2 - '0');
1140 }
1141 } else {
1142 if (shouldBuildStrings)
1143 record16(character1 - '0');
1144 }
1145 } else if (!atEnd()) {
1146 if (shouldBuildStrings)
1147 record16(m_current);
1148 shift();
1149 } else {
1150 m_lexErrorMessage = "Unterminated string constant";
1151 return StringUnterminated;
1152 }
1153
1154 stringStart = currentSourcePtr();
1155 continue;
1156 }
1157 // Fast check for characters that require special handling.
1158 // Catches 0, \n, \r, 0x2028, and 0x2029 as efficiently
1159 // as possible, and lets through all common ASCII characters.
1160 if (UNLIKELY(((static_cast<unsigned>(m_current) - 0xE) & 0x2000))) {
1161 // New-line or end of input is not allowed
1162 if (atEnd() || isLineTerminator(m_current)) {
1163 m_lexErrorMessage = "Unexpected EOF";
1164 return atEnd() ? StringUnterminated : StringCannotBeParsed;
1165 }
1166 // Anything else is just a normal character
1167 }
1168 shift();
1169 }
1170
1171 if (currentSourcePtr() != stringStart && shouldBuildStrings)
1172 append16(stringStart, currentSourcePtr() - stringStart);
1173 if (shouldBuildStrings)
1174 tokenData->ident = makeIdentifier(m_buffer16.data(), m_buffer16.size());
1175 else
1176 tokenData->ident = 0;
1177
1178 m_buffer16.resize(0);
1179 return StringParsedSuccessfully;
1180 }
1181
1182 template <typename T>
1183 ALWAYS_INLINE void Lexer<T>::parseHex(double& returnValue)
1184 {
1185 // Optimization: most hexadecimal values fit into 4 bytes.
1186 uint32_t hexValue = 0;
1187 int maximumDigits = 7;
1188
1189 // Shift out the 'x' prefix.
1190 shift();
1191
1192 do {
1193 hexValue = (hexValue << 4) + toASCIIHexValue(m_current);
1194 shift();
1195 --maximumDigits;
1196 } while (isASCIIHexDigit(m_current) && maximumDigits >= 0);
1197
1198 if (maximumDigits >= 0) {
1199 returnValue = hexValue;
1200 return;
1201 }
1202
1203 // No more place in the hexValue buffer.
1204 // The values are shifted out and placed into the m_buffer8 vector.
1205 for (int i = 0; i < 8; ++i) {
1206 int digit = hexValue >> 28;
1207 if (digit < 10)
1208 record8(digit + '0');
1209 else
1210 record8(digit - 10 + 'a');
1211 hexValue <<= 4;
1212 }
1213
1214 while (isASCIIHexDigit(m_current)) {
1215 record8(m_current);
1216 shift();
1217 }
1218
1219 returnValue = parseIntOverflow(m_buffer8.data(), m_buffer8.size(), 16);
1220 }
1221
1222 template <typename T>
1223 ALWAYS_INLINE bool Lexer<T>::parseOctal(double& returnValue)
1224 {
1225 // Optimization: most octal values fit into 4 bytes.
1226 uint32_t octalValue = 0;
1227 int maximumDigits = 9;
1228 // Temporary buffer for the digits. Makes easier
1229 // to reconstruct the input characters when needed.
1230 LChar digits[10];
1231
1232 do {
1233 octalValue = octalValue * 8 + (m_current - '0');
1234 digits[maximumDigits] = m_current;
1235 shift();
1236 --maximumDigits;
1237 } while (isASCIIOctalDigit(m_current) && maximumDigits >= 0);
1238
1239 if (!isASCIIDigit(m_current) && maximumDigits >= 0) {
1240 returnValue = octalValue;
1241 return true;
1242 }
1243
1244 for (int i = 9; i > maximumDigits; --i)
1245 record8(digits[i]);
1246
1247 while (isASCIIOctalDigit(m_current)) {
1248 record8(m_current);
1249 shift();
1250 }
1251
1252 if (isASCIIDigit(m_current))
1253 return false;
1254
1255 returnValue = parseIntOverflow(m_buffer8.data(), m_buffer8.size(), 8);
1256 return true;
1257 }
1258
1259 template <typename T>
1260 ALWAYS_INLINE bool Lexer<T>::parseDecimal(double& returnValue)
1261 {
1262 // Optimization: most decimal values fit into 4 bytes.
1263 uint32_t decimalValue = 0;
1264
1265 // Since parseOctal may be executed before parseDecimal,
1266 // the m_buffer8 may hold ascii digits.
1267 if (!m_buffer8.size()) {
1268 int maximumDigits = 9;
1269 // Temporary buffer for the digits. Makes easier
1270 // to reconstruct the input characters when needed.
1271 LChar digits[10];
1272
1273 do {
1274 decimalValue = decimalValue * 10 + (m_current - '0');
1275 digits[maximumDigits] = m_current;
1276 shift();
1277 --maximumDigits;
1278 } while (isASCIIDigit(m_current) && maximumDigits >= 0);
1279
1280 if (maximumDigits >= 0 && m_current != '.' && (m_current | 0x20) != 'e') {
1281 returnValue = decimalValue;
1282 return true;
1283 }
1284
1285 for (int i = 9; i > maximumDigits; --i)
1286 record8(digits[i]);
1287 }
1288
1289 while (isASCIIDigit(m_current)) {
1290 record8(m_current);
1291 shift();
1292 }
1293
1294 return false;
1295 }
1296
1297 template <typename T>
1298 ALWAYS_INLINE void Lexer<T>::parseNumberAfterDecimalPoint()
1299 {
1300 record8('.');
1301 while (isASCIIDigit(m_current)) {
1302 record8(m_current);
1303 shift();
1304 }
1305 }
1306
1307 template <typename T>
1308 ALWAYS_INLINE bool Lexer<T>::parseNumberAfterExponentIndicator()
1309 {
1310 record8('e');
1311 shift();
1312 if (m_current == '+' || m_current == '-') {
1313 record8(m_current);
1314 shift();
1315 }
1316
1317 if (!isASCIIDigit(m_current))
1318 return false;
1319
1320 do {
1321 record8(m_current);
1322 shift();
1323 } while (isASCIIDigit(m_current));
1324 return true;
1325 }
1326
1327 template <typename T>
1328 ALWAYS_INLINE bool Lexer<T>::parseMultilineComment()
1329 {
1330 while (true) {
1331 while (UNLIKELY(m_current == '*')) {
1332 shift();
1333 if (m_current == '/') {
1334 shift();
1335 return true;
1336 }
1337 }
1338
1339 if (atEnd())
1340 return false;
1341
1342 if (isLineTerminator(m_current)) {
1343 shiftLineTerminator();
1344 m_terminator = true;
1345 } else
1346 shift();
1347 }
1348 }
1349
1350 template <typename T>
1351 bool Lexer<T>::nextTokenIsColon()
1352 {
1353 const T* code = m_code;
1354 while (code < m_codeEnd && (isWhiteSpace(*code) || isLineTerminator(*code)))
1355 code++;
1356
1357 return code < m_codeEnd && *code == ':';
1358 }
1359
1360 template <typename T>
1361 JSTokenType Lexer<T>::lex(JSToken* tokenRecord, unsigned lexerFlags, bool strictMode)
1362 {
1363 JSTokenData* tokenData = &tokenRecord->m_data;
1364 JSTokenLocation* tokenLocation = &tokenRecord->m_location;
1365 ASSERT(!m_error);
1366 ASSERT(m_buffer8.isEmpty());
1367 ASSERT(m_buffer16.isEmpty());
1368
1369 JSTokenType token = ERRORTOK;
1370 m_terminator = false;
1371
1372 start:
1373 while (isWhiteSpace(m_current))
1374 shift();
1375
1376 if (atEnd())
1377 return EOFTOK;
1378
1379 tokenLocation->startOffset = currentOffset();
1380 ASSERT(currentOffset() >= currentLineStartOffset());
1381 tokenRecord->m_startPosition = currentPosition();
1382
1383 CharacterType type;
1384 if (LIKELY(isLatin1(m_current)))
1385 type = static_cast<CharacterType>(typesOfLatin1Characters[m_current]);
1386 else if (isNonLatin1IdentStart(m_current))
1387 type = CharacterIdentifierStart;
1388 else if (isLineTerminator(m_current))
1389 type = CharacterLineTerminator;
1390 else
1391 type = CharacterInvalid;
1392
1393 switch (type) {
1394 case CharacterGreater:
1395 shift();
1396 if (m_current == '>') {
1397 shift();
1398 if (m_current == '>') {
1399 shift();
1400 if (m_current == '=') {
1401 shift();
1402 token = URSHIFTEQUAL;
1403 break;
1404 }
1405 token = URSHIFT;
1406 break;
1407 }
1408 if (m_current == '=') {
1409 shift();
1410 token = RSHIFTEQUAL;
1411 break;
1412 }
1413 token = RSHIFT;
1414 break;
1415 }
1416 if (m_current == '=') {
1417 shift();
1418 token = GE;
1419 break;
1420 }
1421 token = GT;
1422 break;
1423 case CharacterEqual:
1424 shift();
1425 if (m_current == '=') {
1426 shift();
1427 if (m_current == '=') {
1428 shift();
1429 token = STREQ;
1430 break;
1431 }
1432 token = EQEQ;
1433 break;
1434 }
1435 token = EQUAL;
1436 break;
1437 case CharacterLess:
1438 shift();
1439 if (m_current == '!' && peek(1) == '-' && peek(2) == '-') {
1440 // <!-- marks the beginning of a line comment (for www usage)
1441 goto inSingleLineComment;
1442 }
1443 if (m_current == '<') {
1444 shift();
1445 if (m_current == '=') {
1446 shift();
1447 token = LSHIFTEQUAL;
1448 break;
1449 }
1450 token = LSHIFT;
1451 break;
1452 }
1453 if (m_current == '=') {
1454 shift();
1455 token = LE;
1456 break;
1457 }
1458 token = LT;
1459 break;
1460 case CharacterExclamationMark:
1461 shift();
1462 if (m_current == '=') {
1463 shift();
1464 if (m_current == '=') {
1465 shift();
1466 token = STRNEQ;
1467 break;
1468 }
1469 token = NE;
1470 break;
1471 }
1472 token = EXCLAMATION;
1473 break;
1474 case CharacterAdd:
1475 shift();
1476 if (m_current == '+') {
1477 shift();
1478 token = (!m_terminator) ? PLUSPLUS : AUTOPLUSPLUS;
1479 break;
1480 }
1481 if (m_current == '=') {
1482 shift();
1483 token = PLUSEQUAL;
1484 break;
1485 }
1486 token = PLUS;
1487 break;
1488 case CharacterSub:
1489 shift();
1490 if (m_current == '-') {
1491 shift();
1492 if (m_atLineStart && m_current == '>') {
1493 shift();
1494 goto inSingleLineComment;
1495 }
1496 token = (!m_terminator) ? MINUSMINUS : AUTOMINUSMINUS;
1497 break;
1498 }
1499 if (m_current == '=') {
1500 shift();
1501 token = MINUSEQUAL;
1502 break;
1503 }
1504 token = MINUS;
1505 break;
1506 case CharacterMultiply:
1507 shift();
1508 if (m_current == '=') {
1509 shift();
1510 token = MULTEQUAL;
1511 break;
1512 }
1513 token = TIMES;
1514 break;
1515 case CharacterSlash:
1516 shift();
1517 if (m_current == '/') {
1518 shift();
1519 goto inSingleLineComment;
1520 }
1521 if (m_current == '*') {
1522 shift();
1523 if (parseMultilineComment())
1524 goto start;
1525 m_lexErrorMessage = "Multiline comment was not closed properly";
1526 token = UNTERMINATED_MULTILINE_COMMENT_ERRORTOK;
1527 goto returnError;
1528 }
1529 if (m_current == '=') {
1530 shift();
1531 token = DIVEQUAL;
1532 break;
1533 }
1534 token = DIVIDE;
1535 break;
1536 case CharacterAnd:
1537 shift();
1538 if (m_current == '&') {
1539 shift();
1540 token = AND;
1541 break;
1542 }
1543 if (m_current == '=') {
1544 shift();
1545 token = ANDEQUAL;
1546 break;
1547 }
1548 token = BITAND;
1549 break;
1550 case CharacterXor:
1551 shift();
1552 if (m_current == '=') {
1553 shift();
1554 token = XOREQUAL;
1555 break;
1556 }
1557 token = BITXOR;
1558 break;
1559 case CharacterModulo:
1560 shift();
1561 if (m_current == '=') {
1562 shift();
1563 token = MODEQUAL;
1564 break;
1565 }
1566 token = MOD;
1567 break;
1568 case CharacterOr:
1569 shift();
1570 if (m_current == '=') {
1571 shift();
1572 token = OREQUAL;
1573 break;
1574 }
1575 if (m_current == '|') {
1576 shift();
1577 token = OR;
1578 break;
1579 }
1580 token = BITOR;
1581 break;
1582 case CharacterOpenParen:
1583 token = OPENPAREN;
1584 shift();
1585 break;
1586 case CharacterCloseParen:
1587 token = CLOSEPAREN;
1588 shift();
1589 break;
1590 case CharacterOpenBracket:
1591 token = OPENBRACKET;
1592 shift();
1593 break;
1594 case CharacterCloseBracket:
1595 token = CLOSEBRACKET;
1596 shift();
1597 break;
1598 case CharacterComma:
1599 token = COMMA;
1600 shift();
1601 break;
1602 case CharacterColon:
1603 token = COLON;
1604 shift();
1605 break;
1606 case CharacterQuestion:
1607 token = QUESTION;
1608 shift();
1609 break;
1610 case CharacterTilde:
1611 token = TILDE;
1612 shift();
1613 break;
1614 case CharacterSemicolon:
1615 shift();
1616 token = SEMICOLON;
1617 break;
1618 case CharacterOpenBrace:
1619 tokenData->line = lineNumber();
1620 tokenData->offset = currentOffset();
1621 tokenData->lineStartOffset = currentLineStartOffset();
1622 ASSERT(tokenData->offset >= tokenData->lineStartOffset);
1623 shift();
1624 token = OPENBRACE;
1625 break;
1626 case CharacterCloseBrace:
1627 tokenData->line = lineNumber();
1628 tokenData->offset = currentOffset();
1629 tokenData->lineStartOffset = currentLineStartOffset();
1630 ASSERT(tokenData->offset >= tokenData->lineStartOffset);
1631 shift();
1632 token = CLOSEBRACE;
1633 break;
1634 case CharacterDot:
1635 shift();
1636 if (!isASCIIDigit(m_current)) {
1637 if (UNLIKELY((m_current == '.') && (peek(1) == '.'))) {
1638 shift();
1639 shift();
1640 token = DOTDOTDOT;
1641 break;
1642 }
1643 token = DOT;
1644 break;
1645 }
1646 goto inNumberAfterDecimalPoint;
1647 case CharacterZero:
1648 shift();
1649 if ((m_current | 0x20) == 'x') {
1650 if (!isASCIIHexDigit(peek(1))) {
1651 m_lexErrorMessage = "No hexadecimal digits after '0x'";
1652 token = INVALID_HEX_NUMBER_ERRORTOK;
1653 goto returnError;
1654 }
1655 parseHex(tokenData->doubleValue);
1656 if (isIdentStart(m_current)) {
1657 m_lexErrorMessage = "No space between hexadecimal literal and identifier";
1658 token = INVALID_HEX_NUMBER_ERRORTOK;
1659 goto returnError;
1660 }
1661 token = NUMBER;
1662 m_buffer8.resize(0);
1663 break;
1664 }
1665
1666 record8('0');
1667 if (isASCIIOctalDigit(m_current)) {
1668 if (parseOctal(tokenData->doubleValue)) {
1669 if (strictMode) {
1670 m_lexErrorMessage = "Octal escapes are forbidden in strict mode";
1671 token = INVALID_OCTAL_NUMBER_ERRORTOK;
1672 goto returnError;
1673 }
1674 token = NUMBER;
1675 }
1676 }
1677 FALLTHROUGH;
1678 case CharacterNumber:
1679 if (LIKELY(token != NUMBER)) {
1680 if (!parseDecimal(tokenData->doubleValue)) {
1681 if (m_current == '.') {
1682 shift();
1683 inNumberAfterDecimalPoint:
1684 parseNumberAfterDecimalPoint();
1685 }
1686 if ((m_current | 0x20) == 'e') {
1687 if (!parseNumberAfterExponentIndicator()) {
1688 m_lexErrorMessage = "Non-number found after exponent indicator";
1689 token = atEnd() ? UNTERMINATED_NUMERIC_LITERAL_ERRORTOK : INVALID_NUMERIC_LITERAL_ERRORTOK;
1690 goto returnError;
1691 }
1692 }
1693 size_t parsedLength;
1694 tokenData->doubleValue = parseDouble(m_buffer8.data(), m_buffer8.size(), parsedLength);
1695 }
1696 token = NUMBER;
1697 }
1698
1699 // No identifiers allowed directly after numeric literal, e.g. "3in" is bad.
1700 if (UNLIKELY(isIdentStart(m_current))) {
1701 m_lexErrorMessage = "At least one digit must occur after a decimal point";
1702 token = atEnd() ? UNTERMINATED_NUMERIC_LITERAL_ERRORTOK : INVALID_NUMERIC_LITERAL_ERRORTOK;
1703 goto returnError;
1704 }
1705 m_buffer8.resize(0);
1706 break;
1707 case CharacterQuote:
1708 if (lexerFlags & LexerFlagsDontBuildStrings) {
1709 StringParseResult result = parseString<false>(tokenData, strictMode);
1710 if (UNLIKELY(result != StringParsedSuccessfully)) {
1711 token = result == StringUnterminated ? UNTERMINATED_STRING_LITERAL_ERRORTOK : INVALID_STRING_LITERAL_ERRORTOK;
1712 goto returnError;
1713 }
1714 } else {
1715 StringParseResult result = parseString<true>(tokenData, strictMode);
1716 if (UNLIKELY(result != StringParsedSuccessfully)) {
1717 token = result == StringUnterminated ? UNTERMINATED_STRING_LITERAL_ERRORTOK : INVALID_STRING_LITERAL_ERRORTOK;
1718 goto returnError;
1719 }
1720 }
1721 shift();
1722 token = STRING;
1723 break;
1724 case CharacterIdentifierStart:
1725 ASSERT(isIdentStart(m_current));
1726 FALLTHROUGH;
1727 case CharacterBackSlash:
1728 parseIdent:
1729 if (lexerFlags & LexexFlagsDontBuildKeywords)
1730 token = parseIdentifier<false>(tokenData, lexerFlags, strictMode);
1731 else
1732 token = parseIdentifier<true>(tokenData, lexerFlags, strictMode);
1733 break;
1734 case CharacterLineTerminator:
1735 ASSERT(isLineTerminator(m_current));
1736 shiftLineTerminator();
1737 m_atLineStart = true;
1738 m_terminator = true;
1739 m_lineStart = m_code;
1740 goto start;
1741 case CharacterPrivateIdentifierStart:
1742 if (m_parsingBuiltinFunction)
1743 goto parseIdent;
1744
1745 FALLTHROUGH;
1746 case CharacterInvalid:
1747 m_lexErrorMessage = invalidCharacterMessage();
1748 token = ERRORTOK;
1749 goto returnError;
1750 default:
1751 RELEASE_ASSERT_NOT_REACHED();
1752 m_lexErrorMessage = "Internal Error";
1753 token = ERRORTOK;
1754 goto returnError;
1755 }
1756
1757 m_atLineStart = false;
1758 goto returnToken;
1759
1760 inSingleLineComment:
1761 while (!isLineTerminator(m_current)) {
1762 if (atEnd())
1763 return EOFTOK;
1764 shift();
1765 }
1766 shiftLineTerminator();
1767 m_atLineStart = true;
1768 m_terminator = true;
1769 m_lineStart = m_code;
1770 if (!lastTokenWasRestrKeyword())
1771 goto start;
1772
1773 token = SEMICOLON;
1774 // Fall through into returnToken.
1775
1776 returnToken:
1777 tokenLocation->line = m_lineNumber;
1778 tokenLocation->endOffset = currentOffset();
1779 tokenLocation->lineStartOffset = currentLineStartOffset();
1780 ASSERT(tokenLocation->endOffset >= tokenLocation->lineStartOffset);
1781 tokenRecord->m_endPosition = currentPosition();
1782 m_lastToken = token;
1783 return token;
1784
1785 returnError:
1786 m_error = true;
1787 tokenLocation->line = m_lineNumber;
1788 tokenLocation->endOffset = currentOffset();
1789 tokenLocation->lineStartOffset = currentLineStartOffset();
1790 ASSERT(tokenLocation->endOffset >= tokenLocation->lineStartOffset);
1791 tokenRecord->m_endPosition = currentPosition();
1792 RELEASE_ASSERT(token & ErrorTokenFlag);
1793 return token;
1794 }
1795
1796 template <typename T>
1797 static inline void orCharacter(UChar&, UChar);
1798
1799 template <>
1800 inline void orCharacter<LChar>(UChar&, UChar) { }
1801
1802 template <>
1803 inline void orCharacter<UChar>(UChar& orAccumulator, UChar character)
1804 {
1805 orAccumulator |= character;
1806 }
1807
1808 template <typename T>
1809 bool Lexer<T>::scanRegExp(const Identifier*& pattern, const Identifier*& flags, UChar patternPrefix)
1810 {
1811 ASSERT(m_buffer16.isEmpty());
1812
1813 bool lastWasEscape = false;
1814 bool inBrackets = false;
1815 UChar charactersOredTogether = 0;
1816
1817 if (patternPrefix) {
1818 ASSERT(!isLineTerminator(patternPrefix));
1819 ASSERT(patternPrefix != '/');
1820 ASSERT(patternPrefix != '[');
1821 record16(patternPrefix);
1822 }
1823
1824 while (true) {
1825 if (isLineTerminator(m_current) || atEnd()) {
1826 m_buffer16.resize(0);
1827 return false;
1828 }
1829
1830 T prev = m_current;
1831
1832 shift();
1833
1834 if (prev == '/' && !lastWasEscape && !inBrackets)
1835 break;
1836
1837 record16(prev);
1838 orCharacter<T>(charactersOredTogether, prev);
1839
1840 if (lastWasEscape) {
1841 lastWasEscape = false;
1842 continue;
1843 }
1844
1845 switch (prev) {
1846 case '[':
1847 inBrackets = true;
1848 break;
1849 case ']':
1850 inBrackets = false;
1851 break;
1852 case '\\':
1853 lastWasEscape = true;
1854 break;
1855 }
1856 }
1857
1858 pattern = makeRightSizedIdentifier(m_buffer16.data(), m_buffer16.size(), charactersOredTogether);
1859
1860 m_buffer16.resize(0);
1861 charactersOredTogether = 0;
1862
1863 while (isIdentPart(m_current)) {
1864 record16(m_current);
1865 orCharacter<T>(charactersOredTogether, m_current);
1866 shift();
1867 }
1868
1869 flags = makeRightSizedIdentifier(m_buffer16.data(), m_buffer16.size(), charactersOredTogether);
1870 m_buffer16.resize(0);
1871
1872 return true;
1873 }
1874
1875 template <typename T>
1876 bool Lexer<T>::skipRegExp()
1877 {
1878 bool lastWasEscape = false;
1879 bool inBrackets = false;
1880
1881 while (true) {
1882 if (isLineTerminator(m_current) || atEnd())
1883 return false;
1884
1885 T prev = m_current;
1886
1887 shift();
1888
1889 if (prev == '/' && !lastWasEscape && !inBrackets)
1890 break;
1891
1892 if (lastWasEscape) {
1893 lastWasEscape = false;
1894 continue;
1895 }
1896
1897 switch (prev) {
1898 case '[':
1899 inBrackets = true;
1900 break;
1901 case ']':
1902 inBrackets = false;
1903 break;
1904 case '\\':
1905 lastWasEscape = true;
1906 break;
1907 }
1908 }
1909
1910 while (isIdentPart(m_current))
1911 shift();
1912
1913 return true;
1914 }
1915
1916 template <typename T>
1917 void Lexer<T>::clear()
1918 {
1919 m_arena = 0;
1920
1921 Vector<LChar> newBuffer8;
1922 m_buffer8.swap(newBuffer8);
1923
1924 Vector<UChar> newBuffer16;
1925 m_buffer16.swap(newBuffer16);
1926
1927 m_isReparsing = false;
1928 }
1929
1930 // Instantiate the two flavors of Lexer we need instead of putting most of this file in Lexer.h
1931 template class Lexer<LChar>;
1932 template class Lexer<UChar>;
1933
1934 } // namespace JSC