]> git.saurik.com Git - apple/javascriptcore.git/blame_incremental - parser/Lexer.h
JavaScriptCore-7600.1.4.15.12.tar.gz
[apple/javascriptcore.git] / parser / Lexer.h
... / ...
CommitLineData
1/*
2 * Copyright (C) 1999-2000 Harri Porten (porten@kde.org)
3 * Copyright (C) 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2011, 2012, 2013 Apple Inc. All rights reserved.
4 * Copyright (C) 2010 Zoltan Herczeg (zherczeg@inf.u-szeged.hu)
5 *
6 * This library is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Library General Public
8 * License as published by the Free Software Foundation; either
9 * version 2 of the License, or (at your option) any later version.
10 *
11 * This library is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Library General Public License for more details.
15 *
16 * You should have received a copy of the GNU Library General Public License
17 * along with this library; see the file COPYING.LIB. If not, write to
18 * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
19 * Boston, MA 02110-1301, USA.
20 *
21 */
22
23#ifndef Lexer_h
24#define Lexer_h
25
26#include "Lookup.h"
27#include "ParserArena.h"
28#include "ParserTokens.h"
29#include "SourceCode.h"
30#include <wtf/ASCIICType.h>
31#include <wtf/SegmentedVector.h>
32#include <wtf/Vector.h>
33
34namespace JSC {
35
36class Keywords {
37public:
38 bool isKeyword(const Identifier& ident) const
39 {
40 return m_keywordTable.entry(m_vm, ident);
41 }
42
43 const HashTableValue* getKeyword(const Identifier& ident) const
44 {
45 return m_keywordTable.entry(m_vm, ident);
46 }
47
48 ~Keywords()
49 {
50 m_keywordTable.deleteTable();
51 }
52
53private:
54 friend class VM;
55
56 explicit Keywords(VM&);
57
58 VM& m_vm;
59 const HashTable m_keywordTable;
60};
61
// Bit flags passed to the lexer entry points (combined with bitwise OR).
enum LexerFlags {
    LexerFlagsIgnoreReservedWords = 1,
    LexerFlagsDontBuildStrings = 2,
    // Historical misspelling ("Lexex"); kept so existing callers keep compiling.
    LexexFlagsDontBuildKeywords = 4,
    // Correctly spelled alias of the flag above; prefer this name in new code.
    LexerFlagsDontBuildKeywords = LexexFlagsDontBuildKeywords
};
67
68template <typename T>
69class Lexer {
70 WTF_MAKE_NONCOPYABLE(Lexer);
71 WTF_MAKE_FAST_ALLOCATED;
72
73public:
74 Lexer(VM*, JSParserStrictness);
75 ~Lexer();
76
77 // Character manipulation functions.
78 static bool isWhiteSpace(T character);
79 static bool isLineTerminator(T character);
80 static unsigned char convertHex(int c1, int c2);
81 static UChar convertUnicode(int c1, int c2, int c3, int c4);
82
83 // Functions to set up parsing.
84 void setCode(const SourceCode&, ParserArena*);
85 void setIsReparsing() { m_isReparsing = true; }
86 bool isReparsing() const { return m_isReparsing; }
87
88 JSTokenType lex(JSToken*, unsigned, bool strictMode);
89 bool nextTokenIsColon();
90 int lineNumber() const { return m_lineNumber; }
91 ALWAYS_INLINE int currentOffset() const { return offsetFromSourcePtr(m_code); }
92 ALWAYS_INLINE int currentLineStartOffset() const { return offsetFromSourcePtr(m_lineStart); }
93 ALWAYS_INLINE JSTextPosition currentPosition() const
94 {
95 return JSTextPosition(m_lineNumber, currentOffset(), currentLineStartOffset());
96 }
97 JSTextPosition positionBeforeLastNewline() const { return m_positionBeforeLastNewline; }
98 void setLastLineNumber(int lastLineNumber) { m_lastLineNumber = lastLineNumber; }
99 int lastLineNumber() const { return m_lastLineNumber; }
100 bool prevTerminator() const { return m_terminator; }
101 bool scanRegExp(const Identifier*& pattern, const Identifier*& flags, UChar patternPrefix = 0);
102 bool skipRegExp();
103
104 // Functions for use after parsing.
105 bool sawError() const { return m_error; }
106 String getErrorMessage() const { return m_lexErrorMessage; }
107 void clear();
108 void setOffset(int offset, int lineStartOffset)
109 {
110 m_error = 0;
111 m_lexErrorMessage = String();
112
113 m_code = sourcePtrFromOffset(offset);
114 m_lineStart = sourcePtrFromOffset(lineStartOffset);
115 ASSERT(currentOffset() >= currentLineStartOffset());
116
117 m_buffer8.resize(0);
118 m_buffer16.resize(0);
119 if (LIKELY(m_code < m_codeEnd))
120 m_current = *m_code;
121 else
122 m_current = 0;
123 }
124 void setLineNumber(int line)
125 {
126 m_lineNumber = line;
127 }
128
129 SourceProvider* sourceProvider() const { return m_source->provider(); }
130
131 JSTokenType lexExpectIdentifier(JSToken*, unsigned, bool strictMode);
132
133private:
134 void record8(int);
135 void append8(const T*, size_t);
136 void record16(int);
137 void record16(T);
138 void append16(const LChar*, size_t);
139 void append16(const UChar* characters, size_t length) { m_buffer16.append(characters, length); }
140
141 ALWAYS_INLINE void shift();
142 ALWAYS_INLINE bool atEnd() const;
143 ALWAYS_INLINE T peek(int offset) const;
144 struct UnicodeHexValue {
145
146 enum ValueType { ValidHex, IncompleteHex, InvalidHex };
147
148 explicit UnicodeHexValue(int value)
149 : m_value(value)
150 {
151 }
152 explicit UnicodeHexValue(ValueType type)
153 : m_value(type == IncompleteHex ? -2 : -1)
154 {
155 }
156
157 ValueType valueType() const
158 {
159 if (m_value >= 0)
160 return ValidHex;
161 return m_value == -2 ? IncompleteHex : InvalidHex;
162 }
163 bool isValid() const { return m_value >= 0; }
164 int value() const
165 {
166 ASSERT(m_value >= 0);
167 return m_value;
168 }
169
170 private:
171 int m_value;
172 };
173 UnicodeHexValue parseFourDigitUnicodeHex();
174 void shiftLineTerminator();
175
176 ALWAYS_INLINE int offsetFromSourcePtr(const T* ptr) const { return ptr - m_codeStart; }
177 ALWAYS_INLINE const T* sourcePtrFromOffset(int offset) const { return m_codeStart + offset; }
178
179 String invalidCharacterMessage() const;
180 ALWAYS_INLINE const T* currentSourcePtr() const;
181 ALWAYS_INLINE void setOffsetFromSourcePtr(const T* sourcePtr, unsigned lineStartOffset) { setOffset(offsetFromSourcePtr(sourcePtr), lineStartOffset); }
182
183 ALWAYS_INLINE void setCodeStart(const StringImpl*);
184
185 ALWAYS_INLINE const Identifier* makeIdentifier(const LChar* characters, size_t length);
186 ALWAYS_INLINE const Identifier* makeIdentifier(const UChar* characters, size_t length);
187 ALWAYS_INLINE const Identifier* makeLCharIdentifier(const LChar* characters, size_t length);
188 ALWAYS_INLINE const Identifier* makeLCharIdentifier(const UChar* characters, size_t length);
189 ALWAYS_INLINE const Identifier* makeRightSizedIdentifier(const UChar* characters, size_t length, UChar orAllChars);
190 ALWAYS_INLINE const Identifier* makeIdentifierLCharFromUChar(const UChar* characters, size_t length);
191
192 ALWAYS_INLINE bool lastTokenWasRestrKeyword() const;
193
194 template <int shiftAmount> void internalShift();
195 template <bool shouldCreateIdentifier> ALWAYS_INLINE JSTokenType parseKeyword(JSTokenData*);
196 template <bool shouldBuildIdentifiers> ALWAYS_INLINE JSTokenType parseIdentifier(JSTokenData*, unsigned lexerFlags, bool strictMode);
197 template <bool shouldBuildIdentifiers> NEVER_INLINE JSTokenType parseIdentifierSlowCase(JSTokenData*, unsigned lexerFlags, bool strictMode);
198 enum StringParseResult {
199 StringParsedSuccessfully,
200 StringUnterminated,
201 StringCannotBeParsed
202 };
203 template <bool shouldBuildStrings> ALWAYS_INLINE StringParseResult parseString(JSTokenData*, bool strictMode);
204 template <bool shouldBuildStrings> NEVER_INLINE StringParseResult parseStringSlowCase(JSTokenData*, bool strictMode);
205 ALWAYS_INLINE void parseHex(double& returnValue);
206 ALWAYS_INLINE bool parseOctal(double& returnValue);
207 ALWAYS_INLINE bool parseDecimal(double& returnValue);
208 ALWAYS_INLINE void parseNumberAfterDecimalPoint();
209 ALWAYS_INLINE bool parseNumberAfterExponentIndicator();
210 ALWAYS_INLINE bool parseMultilineComment();
211
212 static const size_t initialReadBufferCapacity = 32;
213
214 int m_lineNumber;
215 int m_lastLineNumber;
216
217 Vector<LChar> m_buffer8;
218 Vector<UChar> m_buffer16;
219 bool m_terminator;
220 int m_lastToken;
221
222 const SourceCode* m_source;
223 unsigned m_sourceOffset;
224 const T* m_code;
225 const T* m_codeStart;
226 const T* m_codeEnd;
227 const T* m_codeStartPlusOffset;
228 const T* m_lineStart;
229 JSTextPosition m_positionBeforeLastNewline;
230 bool m_isReparsing;
231 bool m_atLineStart;
232 bool m_error;
233 String m_lexErrorMessage;
234
235 T m_current;
236
237 IdentifierArena* m_arena;
238
239 VM* m_vm;
240 bool m_parsingBuiltinFunction;
241};
242
243template <>
244ALWAYS_INLINE bool Lexer<LChar>::isWhiteSpace(LChar ch)
245{
246 return ch == ' ' || ch == '\t' || ch == 0xB || ch == 0xC || ch == 0xA0;
247}
248
249template <>
250ALWAYS_INLINE bool Lexer<UChar>::isWhiteSpace(UChar ch)
251{
252 // 0x180E used to be in Zs category before Unicode 6.3, and EcmaScript says that we should keep treating it as such.
253 return (ch < 256) ? Lexer<LChar>::isWhiteSpace(static_cast<LChar>(ch)) : (u_charType(ch) == U_SPACE_SEPARATOR || ch == 0x180E || ch == 0xFEFF);
254}
255
256template <>
257ALWAYS_INLINE bool Lexer<LChar>::isLineTerminator(LChar ch)
258{
259 return ch == '\r' || ch == '\n';
260}
261
262template <>
263ALWAYS_INLINE bool Lexer<UChar>::isLineTerminator(UChar ch)
264{
265 return ch == '\r' || ch == '\n' || (ch & ~1) == 0x2028;
266}
267
268template <typename T>
269inline unsigned char Lexer<T>::convertHex(int c1, int c2)
270{
271 return (toASCIIHexValue(c1) << 4) | toASCIIHexValue(c2);
272}
273
274template <typename T>
275inline UChar Lexer<T>::convertUnicode(int c1, int c2, int c3, int c4)
276{
277 return (convertHex(c1, c2) << 8) | convertHex(c3, c4);
278}
279
280template <typename T>
281ALWAYS_INLINE const Identifier* Lexer<T>::makeIdentifier(const LChar* characters, size_t length)
282{
283 return &m_arena->makeIdentifier(m_vm, characters, length);
284}
285
286template <typename T>
287ALWAYS_INLINE const Identifier* Lexer<T>::makeIdentifier(const UChar* characters, size_t length)
288{
289 return &m_arena->makeIdentifier(m_vm, characters, length);
290}
291
292template <>
293ALWAYS_INLINE const Identifier* Lexer<LChar>::makeRightSizedIdentifier(const UChar* characters, size_t length, UChar)
294{
295 return &m_arena->makeIdentifierLCharFromUChar(m_vm, characters, length);
296}
297
298template <>
299ALWAYS_INLINE const Identifier* Lexer<UChar>::makeRightSizedIdentifier(const UChar* characters, size_t length, UChar orAllChars)
300{
301 if (!(orAllChars & ~0xff))
302 return &m_arena->makeIdentifierLCharFromUChar(m_vm, characters, length);
303
304 return &m_arena->makeIdentifier(m_vm, characters, length);
305}
306
307template <>
308ALWAYS_INLINE void Lexer<LChar>::setCodeStart(const StringImpl* sourceString)
309{
310 ASSERT(sourceString->is8Bit());
311 m_codeStart = sourceString->characters8();
312}
313
314template <>
315ALWAYS_INLINE void Lexer<UChar>::setCodeStart(const StringImpl* sourceString)
316{
317 ASSERT(!sourceString->is8Bit());
318 m_codeStart = sourceString->characters16();
319}
320
321template <typename T>
322ALWAYS_INLINE const Identifier* Lexer<T>::makeIdentifierLCharFromUChar(const UChar* characters, size_t length)
323{
324 return &m_arena->makeIdentifierLCharFromUChar(m_vm, characters, length);
325}
326
327template <typename T>
328ALWAYS_INLINE const Identifier* Lexer<T>::makeLCharIdentifier(const LChar* characters, size_t length)
329{
330 return &m_arena->makeIdentifier(m_vm, characters, length);
331}
332
333template <typename T>
334ALWAYS_INLINE const Identifier* Lexer<T>::makeLCharIdentifier(const UChar* characters, size_t length)
335{
336 return &m_arena->makeIdentifierLCharFromUChar(m_vm, characters, length);
337}
338
339#if ASSERT_DISABLED
340ALWAYS_INLINE bool isSafeBuiltinIdentifier(VM&, const Identifier*) { return true; }
341#else
342bool isSafeBuiltinIdentifier(VM&, const Identifier*);
343#endif
344
345template <typename T>
346ALWAYS_INLINE JSTokenType Lexer<T>::lexExpectIdentifier(JSToken* tokenRecord, unsigned lexerFlags, bool strictMode)
347{
348 JSTokenData* tokenData = &tokenRecord->m_data;
349 JSTokenLocation* tokenLocation = &tokenRecord->m_location;
350 ASSERT((lexerFlags & LexerFlagsIgnoreReservedWords));
351 const T* start = m_code;
352 const T* ptr = start;
353 const T* end = m_codeEnd;
354 JSTextPosition startPosition = currentPosition();
355 if (ptr >= end) {
356 ASSERT(ptr == end);
357 goto slowCase;
358 }
359 if (!WTF::isASCIIAlpha(*ptr))
360 goto slowCase;
361 ++ptr;
362 while (ptr < end) {
363 if (!WTF::isASCIIAlphanumeric(*ptr))
364 break;
365 ++ptr;
366 }
367
368 // Here's the shift
369 if (ptr < end) {
370 if ((!WTF::isASCII(*ptr)) || (*ptr == '\\') || (*ptr == '_') || (*ptr == '$'))
371 goto slowCase;
372 m_current = *ptr;
373 } else
374 m_current = 0;
375
376 m_code = ptr;
377 ASSERT(currentOffset() >= currentLineStartOffset());
378
379 // Create the identifier if needed
380 if (lexerFlags & LexexFlagsDontBuildKeywords
381#if !ASSERT_DISABLED
382 && !m_parsingBuiltinFunction
383#endif
384 )
385 tokenData->ident = 0;
386 else
387 tokenData->ident = makeLCharIdentifier(start, ptr - start);
388
389 tokenLocation->line = m_lineNumber;
390 tokenLocation->lineStartOffset = currentLineStartOffset();
391 tokenLocation->startOffset = offsetFromSourcePtr(start);
392 tokenLocation->endOffset = currentOffset();
393 ASSERT(tokenLocation->startOffset >= tokenLocation->lineStartOffset);
394 tokenRecord->m_startPosition = startPosition;
395 tokenRecord->m_endPosition = currentPosition();
396#if !ASSERT_DISABLED
397 if (m_parsingBuiltinFunction) {
398 if (!isSafeBuiltinIdentifier(*m_vm, tokenData->ident))
399 return ERRORTOK;
400 }
401#endif
402
403 m_lastToken = IDENT;
404 return IDENT;
405
406slowCase:
407 return lex(tokenRecord, lexerFlags, strictMode);
408}
409
410} // namespace JSC
411
412#endif // Lexer_h