]> git.saurik.com Git - apple/javascriptcore.git/blob - parser/Lexer.h
JavaScriptCore-7601.1.46.3.tar.gz
[apple/javascriptcore.git] / parser / Lexer.h
1 /*
2 * Copyright (C) 1999-2000 Harri Porten (porten@kde.org)
3 * Copyright (C) 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2011, 2012, 2013 Apple Inc. All rights reserved.
4 * Copyright (C) 2010 Zoltan Herczeg (zherczeg@inf.u-szeged.hu)
5 *
6 * This library is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Library General Public
8 * License as published by the Free Software Foundation; either
9 * version 2 of the License, or (at your option) any later version.
10 *
11 * This library is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Library General Public License for more details.
15 *
16 * You should have received a copy of the GNU Library General Public License
17 * along with this library; see the file COPYING.LIB. If not, write to
18 * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
19 * Boston, MA 02110-1301, USA.
20 *
21 */
22
23 #ifndef Lexer_h
24 #define Lexer_h
25
26 #include "Lookup.h"
27 #include "ParserArena.h"
28 #include "ParserTokens.h"
29 #include "SourceCode.h"
30 #include <wtf/ASCIICType.h>
31 #include <wtf/SegmentedVector.h>
32 #include <wtf/Vector.h>
33
34 namespace JSC {
35
36 class Keywords {
37 public:
38 bool isKeyword(const Identifier& ident) const
39 {
40 return m_keywordTable.entry(ident);
41 }
42
43 const HashTableValue* getKeyword(const Identifier& ident) const
44 {
45 return m_keywordTable.entry(ident);
46 }
47
48 explicit Keywords(VM&);
49
50 ~Keywords()
51 {
52 m_keywordTable.deleteTable();
53 }
54
55 private:
56 friend class VM;
57
58 VM& m_vm;
59 const HashTable m_keywordTable;
60 };
61
// Bit flags combined into the `unsigned lexerFlags` argument of the lexing
// entry points; each enumerator occupies its own bit.
enum LexerFlags {
    LexerFlagsIgnoreReservedWords = 1,
    LexerFlagsDontBuildStrings = 2,
    LexerFlagsDontBuildKeywords = 4,
    // Misspelled legacy name ("Lexex"); kept as an alias with the same value
    // so that existing users of the old spelling keep compiling.
    LexexFlagsDontBuildKeywords = LexerFlagsDontBuildKeywords
};
67
68 struct ParsedUnicodeEscapeValue;
69
70 template <typename T>
71 class Lexer {
72 WTF_MAKE_NONCOPYABLE(Lexer);
73 WTF_MAKE_FAST_ALLOCATED;
74
75 public:
76 Lexer(VM*, JSParserBuiltinMode);
77 ~Lexer();
78
79 // Character manipulation functions.
80 static bool isWhiteSpace(T character);
81 static bool isLineTerminator(T character);
82 static unsigned char convertHex(int c1, int c2);
83 static UChar convertUnicode(int c1, int c2, int c3, int c4);
84
85 // Functions to set up parsing.
86 void setCode(const SourceCode&, ParserArena*);
87 void setIsReparsing() { m_isReparsing = true; }
88 bool isReparsing() const { return m_isReparsing; }
89
90 #if ENABLE(ES6_ARROWFUNCTION_SYNTAX)
91 void setTokenPosition(JSToken* tokenRecord);
92 #endif
93 JSTokenType lex(JSToken*, unsigned, bool strictMode);
94 bool nextTokenIsColon();
95 int lineNumber() const { return m_lineNumber; }
96 ALWAYS_INLINE int currentOffset() const { return offsetFromSourcePtr(m_code); }
97 ALWAYS_INLINE int currentLineStartOffset() const { return offsetFromSourcePtr(m_lineStart); }
98 ALWAYS_INLINE JSTextPosition currentPosition() const
99 {
100 return JSTextPosition(m_lineNumber, currentOffset(), currentLineStartOffset());
101 }
102 JSTextPosition positionBeforeLastNewline() const { return m_positionBeforeLastNewline; }
103 JSTokenLocation lastTokenLocation() const { return m_lastTockenLocation; }
104 void setLastLineNumber(int lastLineNumber) { m_lastLineNumber = lastLineNumber; }
105 int lastLineNumber() const { return m_lastLineNumber; }
106 bool prevTerminator() const { return m_terminator; }
107 bool scanRegExp(const Identifier*& pattern, const Identifier*& flags, UChar patternPrefix = 0);
108 #if ENABLE(ES6_TEMPLATE_LITERAL_SYNTAX)
109 enum class RawStringsBuildMode { BuildRawStrings, DontBuildRawStrings };
110 JSTokenType scanTrailingTemplateString(JSToken*, RawStringsBuildMode);
111 #endif
112 bool skipRegExp();
113
114 // Functions for use after parsing.
115 bool sawError() const { return m_error; }
116 String getErrorMessage() const { return m_lexErrorMessage; }
117 void clear();
118 void setOffset(int offset, int lineStartOffset)
119 {
120 m_error = 0;
121 m_lexErrorMessage = String();
122
123 m_code = sourcePtrFromOffset(offset);
124 m_lineStart = sourcePtrFromOffset(lineStartOffset);
125 ASSERT(currentOffset() >= currentLineStartOffset());
126
127 m_buffer8.resize(0);
128 m_buffer16.resize(0);
129 if (LIKELY(m_code < m_codeEnd))
130 m_current = *m_code;
131 else
132 m_current = 0;
133 }
134 void setLineNumber(int line)
135 {
136 m_lineNumber = line;
137 }
138 void setTerminator(bool terminator)
139 {
140 m_terminator = terminator;
141 }
142
143 SourceProvider* sourceProvider() const { return m_source->provider(); }
144
145 JSTokenType lexExpectIdentifier(JSToken*, unsigned, bool strictMode);
146
147 private:
148 void record8(int);
149 void append8(const T*, size_t);
150 void record16(int);
151 void record16(T);
152 void recordUnicodeCodePoint(UChar32);
153 void append16(const LChar*, size_t);
154 void append16(const UChar* characters, size_t length) { m_buffer16.append(characters, length); }
155
156 ALWAYS_INLINE void shift();
157 ALWAYS_INLINE bool atEnd() const;
158 ALWAYS_INLINE T peek(int offset) const;
159
160 ParsedUnicodeEscapeValue parseUnicodeEscape();
161 void shiftLineTerminator();
162
163 ALWAYS_INLINE int offsetFromSourcePtr(const T* ptr) const { return ptr - m_codeStart; }
164 ALWAYS_INLINE const T* sourcePtrFromOffset(int offset) const { return m_codeStart + offset; }
165
166 String invalidCharacterMessage() const;
167 ALWAYS_INLINE const T* currentSourcePtr() const;
168 ALWAYS_INLINE void setOffsetFromSourcePtr(const T* sourcePtr, unsigned lineStartOffset) { setOffset(offsetFromSourcePtr(sourcePtr), lineStartOffset); }
169
170 ALWAYS_INLINE void setCodeStart(const StringImpl*);
171
172 ALWAYS_INLINE const Identifier* makeIdentifier(const LChar* characters, size_t length);
173 ALWAYS_INLINE const Identifier* makeIdentifier(const UChar* characters, size_t length);
174 ALWAYS_INLINE const Identifier* makeLCharIdentifier(const LChar* characters, size_t length);
175 ALWAYS_INLINE const Identifier* makeLCharIdentifier(const UChar* characters, size_t length);
176 ALWAYS_INLINE const Identifier* makeRightSizedIdentifier(const UChar* characters, size_t length, UChar orAllChars);
177 ALWAYS_INLINE const Identifier* makeIdentifierLCharFromUChar(const UChar* characters, size_t length);
178 ALWAYS_INLINE const Identifier* makeEmptyIdentifier();
179
180 ALWAYS_INLINE bool lastTokenWasRestrKeyword() const;
181
182 template <int shiftAmount> void internalShift();
183 template <bool shouldCreateIdentifier> ALWAYS_INLINE JSTokenType parseKeyword(JSTokenData*);
184 template <bool shouldBuildIdentifiers> ALWAYS_INLINE JSTokenType parseIdentifier(JSTokenData*, unsigned lexerFlags, bool strictMode);
185 template <bool shouldBuildIdentifiers> NEVER_INLINE JSTokenType parseIdentifierSlowCase(JSTokenData*, unsigned lexerFlags, bool strictMode);
186 enum StringParseResult {
187 StringParsedSuccessfully,
188 StringUnterminated,
189 StringCannotBeParsed
190 };
191 template <bool shouldBuildStrings> ALWAYS_INLINE StringParseResult parseString(JSTokenData*, bool strictMode);
192 template <bool shouldBuildStrings> NEVER_INLINE StringParseResult parseStringSlowCase(JSTokenData*, bool strictMode);
193
194 enum class EscapeParseMode { Template, String };
195 template <bool shouldBuildStrings> ALWAYS_INLINE StringParseResult parseComplexEscape(EscapeParseMode, bool strictMode, T stringQuoteCharacter);
196 #if ENABLE(ES6_TEMPLATE_LITERAL_SYNTAX)
197 template <bool shouldBuildStrings> ALWAYS_INLINE StringParseResult parseTemplateLiteral(JSTokenData*, RawStringsBuildMode);
198 #endif
199 ALWAYS_INLINE void parseHex(double& returnValue);
200 ALWAYS_INLINE bool parseBinary(double& returnValue);
201 ALWAYS_INLINE bool parseOctal(double& returnValue);
202 ALWAYS_INLINE bool parseDecimal(double& returnValue);
203 ALWAYS_INLINE void parseNumberAfterDecimalPoint();
204 ALWAYS_INLINE bool parseNumberAfterExponentIndicator();
205 ALWAYS_INLINE bool parseMultilineComment();
206
207 static const size_t initialReadBufferCapacity = 32;
208
209 int m_lineNumber;
210 int m_lastLineNumber;
211
212 Vector<LChar> m_buffer8;
213 Vector<UChar> m_buffer16;
214 Vector<UChar> m_bufferForRawTemplateString16;
215 bool m_terminator;
216 int m_lastToken;
217
218 const SourceCode* m_source;
219 unsigned m_sourceOffset;
220 const T* m_code;
221 const T* m_codeStart;
222 const T* m_codeEnd;
223 const T* m_codeStartPlusOffset;
224 const T* m_lineStart;
225 JSTextPosition m_positionBeforeLastNewline;
226 JSTokenLocation m_lastTockenLocation;
227 bool m_isReparsing;
228 bool m_atLineStart;
229 bool m_error;
230 String m_lexErrorMessage;
231
232 T m_current;
233
234 IdentifierArena* m_arena;
235
236 VM* m_vm;
237 bool m_parsingBuiltinFunction;
238 };
239
240 template <>
241 ALWAYS_INLINE bool Lexer<LChar>::isWhiteSpace(LChar ch)
242 {
243 return ch == ' ' || ch == '\t' || ch == 0xB || ch == 0xC || ch == 0xA0;
244 }
245
246 template <>
247 ALWAYS_INLINE bool Lexer<UChar>::isWhiteSpace(UChar ch)
248 {
249 // 0x180E used to be in Zs category before Unicode 6.3, and EcmaScript says that we should keep treating it as such.
250 return (ch < 256) ? Lexer<LChar>::isWhiteSpace(static_cast<LChar>(ch)) : (u_charType(ch) == U_SPACE_SEPARATOR || ch == 0x180E || ch == 0xFEFF);
251 }
252
253 template <>
254 ALWAYS_INLINE bool Lexer<LChar>::isLineTerminator(LChar ch)
255 {
256 return ch == '\r' || ch == '\n';
257 }
258
259 template <>
260 ALWAYS_INLINE bool Lexer<UChar>::isLineTerminator(UChar ch)
261 {
262 return ch == '\r' || ch == '\n' || (ch & ~1) == 0x2028;
263 }
264
265 template <typename T>
266 inline unsigned char Lexer<T>::convertHex(int c1, int c2)
267 {
268 return (toASCIIHexValue(c1) << 4) | toASCIIHexValue(c2);
269 }
270
271 template <typename T>
272 inline UChar Lexer<T>::convertUnicode(int c1, int c2, int c3, int c4)
273 {
274 return (convertHex(c1, c2) << 8) | convertHex(c3, c4);
275 }
276
277 template <typename T>
278 ALWAYS_INLINE const Identifier* Lexer<T>::makeIdentifier(const LChar* characters, size_t length)
279 {
280 return &m_arena->makeIdentifier(m_vm, characters, length);
281 }
282
283 template <typename T>
284 ALWAYS_INLINE const Identifier* Lexer<T>::makeIdentifier(const UChar* characters, size_t length)
285 {
286 return &m_arena->makeIdentifier(m_vm, characters, length);
287 }
288
289 template <>
290 ALWAYS_INLINE const Identifier* Lexer<LChar>::makeRightSizedIdentifier(const UChar* characters, size_t length, UChar)
291 {
292 return &m_arena->makeIdentifierLCharFromUChar(m_vm, characters, length);
293 }
294
295 template <>
296 ALWAYS_INLINE const Identifier* Lexer<UChar>::makeRightSizedIdentifier(const UChar* characters, size_t length, UChar orAllChars)
297 {
298 if (!(orAllChars & ~0xff))
299 return &m_arena->makeIdentifierLCharFromUChar(m_vm, characters, length);
300
301 return &m_arena->makeIdentifier(m_vm, characters, length);
302 }
303
304 template <typename T>
305 ALWAYS_INLINE const Identifier* Lexer<T>::makeEmptyIdentifier()
306 {
307 return &m_arena->makeEmptyIdentifier(m_vm);
308 }
309
310 template <>
311 ALWAYS_INLINE void Lexer<LChar>::setCodeStart(const StringImpl* sourceString)
312 {
313 ASSERT(sourceString->is8Bit());
314 m_codeStart = sourceString->characters8();
315 }
316
317 template <>
318 ALWAYS_INLINE void Lexer<UChar>::setCodeStart(const StringImpl* sourceString)
319 {
320 ASSERT(!sourceString->is8Bit());
321 m_codeStart = sourceString->characters16();
322 }
323
324 template <typename T>
325 ALWAYS_INLINE const Identifier* Lexer<T>::makeIdentifierLCharFromUChar(const UChar* characters, size_t length)
326 {
327 return &m_arena->makeIdentifierLCharFromUChar(m_vm, characters, length);
328 }
329
330 template <typename T>
331 ALWAYS_INLINE const Identifier* Lexer<T>::makeLCharIdentifier(const LChar* characters, size_t length)
332 {
333 return &m_arena->makeIdentifier(m_vm, characters, length);
334 }
335
336 template <typename T>
337 ALWAYS_INLINE const Identifier* Lexer<T>::makeLCharIdentifier(const UChar* characters, size_t length)
338 {
339 return &m_arena->makeIdentifierLCharFromUChar(m_vm, characters, length);
340 }
341
342 #if ASSERT_DISABLED
343 ALWAYS_INLINE bool isSafeBuiltinIdentifier(VM&, const Identifier*) { return true; }
344 #else
345 bool isSafeBuiltinIdentifier(VM&, const Identifier*);
346 #endif
347
348 template <typename T>
349 ALWAYS_INLINE JSTokenType Lexer<T>::lexExpectIdentifier(JSToken* tokenRecord, unsigned lexerFlags, bool strictMode)
350 {
351 JSTokenData* tokenData = &tokenRecord->m_data;
352 JSTokenLocation* tokenLocation = &tokenRecord->m_location;
353 ASSERT((lexerFlags & LexerFlagsIgnoreReservedWords));
354 const T* start = m_code;
355 const T* ptr = start;
356 const T* end = m_codeEnd;
357 JSTextPosition startPosition = currentPosition();
358 if (ptr >= end) {
359 ASSERT(ptr == end);
360 goto slowCase;
361 }
362 if (!WTF::isASCIIAlpha(*ptr))
363 goto slowCase;
364 ++ptr;
365 while (ptr < end) {
366 if (!WTF::isASCIIAlphanumeric(*ptr))
367 break;
368 ++ptr;
369 }
370
371 // Here's the shift
372 if (ptr < end) {
373 if ((!WTF::isASCII(*ptr)) || (*ptr == '\\') || (*ptr == '_') || (*ptr == '$'))
374 goto slowCase;
375 m_current = *ptr;
376 } else
377 m_current = 0;
378
379 m_code = ptr;
380 ASSERT(currentOffset() >= currentLineStartOffset());
381
382 // Create the identifier if needed
383 if (lexerFlags & LexexFlagsDontBuildKeywords
384 #if !ASSERT_DISABLED
385 && !m_parsingBuiltinFunction
386 #endif
387 )
388 tokenData->ident = 0;
389 else
390 tokenData->ident = makeLCharIdentifier(start, ptr - start);
391
392 tokenLocation->line = m_lineNumber;
393 tokenLocation->lineStartOffset = currentLineStartOffset();
394 tokenLocation->startOffset = offsetFromSourcePtr(start);
395 tokenLocation->endOffset = currentOffset();
396 ASSERT(tokenLocation->startOffset >= tokenLocation->lineStartOffset);
397 tokenRecord->m_startPosition = startPosition;
398 tokenRecord->m_endPosition = currentPosition();
399 #if !ASSERT_DISABLED
400 if (m_parsingBuiltinFunction) {
401 if (!isSafeBuiltinIdentifier(*m_vm, tokenData->ident))
402 return ERRORTOK;
403 }
404 #endif
405
406 m_lastToken = IDENT;
407 return IDENT;
408
409 slowCase:
410 return lex(tokenRecord, lexerFlags, strictMode);
411 }
412
413 } // namespace JSC
414
415 #endif // Lexer_h