]> git.saurik.com Git - apple/javascriptcore.git/blob - parser/Lexer.h
JavaScriptCore-1218.34.tar.gz
[apple/javascriptcore.git] / parser / Lexer.h
1 /*
2 * Copyright (C) 1999-2000 Harri Porten (porten@kde.org)
3 * Copyright (C) 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2011, 2012, 2013 Apple Inc. All rights reserved.
4 * Copyright (C) 2010 Zoltan Herczeg (zherczeg@inf.u-szeged.hu)
5 *
6 * This library is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Library General Public
8 * License as published by the Free Software Foundation; either
9 * version 2 of the License, or (at your option) any later version.
10 *
11 * This library is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Library General Public License for more details.
15 *
16 * You should have received a copy of the GNU Library General Public License
17 * along with this library; see the file COPYING.LIB. If not, write to
18 * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
19 * Boston, MA 02110-1301, USA.
20 *
21 */
22
23 #ifndef Lexer_h
24 #define Lexer_h
25
26 #include "Lookup.h"
27 #include "ParserArena.h"
28 #include "ParserTokens.h"
29 #include "SourceCode.h"
30 #include <wtf/ASCIICType.h>
31 #include <wtf/SegmentedVector.h>
32 #include <wtf/Vector.h>
33 #include <wtf/unicode/Unicode.h>
34
35 namespace JSC {
36
37 class Keywords {
38 public:
39 bool isKeyword(const Identifier& ident) const
40 {
41 return m_keywordTable.entry(m_vm, ident);
42 }
43
44 const HashEntry* getKeyword(const Identifier& ident) const
45 {
46 return m_keywordTable.entry(m_vm, ident);
47 }
48
49 ~Keywords()
50 {
51 m_keywordTable.deleteTable();
52 }
53
54 private:
55 friend class VM;
56
57 Keywords(VM*);
58
59 VM* m_vm;
60 const HashTable m_keywordTable;
61 };
62
// Bit flags combined into the unsigned lexerFlags argument taken by the
// lexing entry points (they are tested there with bitwise AND).
enum LexerFlags {
    LexerFlagsIgnoreReservedWords = 1,
    LexerFlagsDontBuildStrings = 2,
    LexerFlagsDontBuildKeywords = 4,
    // Historical misspelling ("Lexex"); kept as an alias with the same value
    // so that existing users of the old name continue to compile.
    LexexFlagsDontBuildKeywords = LexerFlagsDontBuildKeywords
};
68
69 template <typename T>
70 class Lexer {
71 WTF_MAKE_NONCOPYABLE(Lexer);
72 WTF_MAKE_FAST_ALLOCATED;
73
74 public:
75 Lexer(VM*);
76 ~Lexer();
77
78 // Character manipulation functions.
79 static bool isWhiteSpace(T character);
80 static bool isLineTerminator(T character);
81 static unsigned char convertHex(int c1, int c2);
82 static UChar convertUnicode(int c1, int c2, int c3, int c4);
83
84 // Functions to set up parsing.
85 void setCode(const SourceCode&, ParserArena*);
86 void setIsReparsing() { m_isReparsing = true; }
87 bool isReparsing() const { return m_isReparsing; }
88
89 JSTokenType lex(JSTokenData*, JSTokenLocation*, unsigned, bool strictMode);
90 bool nextTokenIsColon();
91 int lineNumber() const { return m_lineNumber; }
92 ALWAYS_INLINE int currentOffset() const { return offsetFromSourcePtr(m_code); }
93 ALWAYS_INLINE int currentLineStartOffset() const { return offsetFromSourcePtr(m_lineStart); }
94 void setLastLineNumber(int lastLineNumber) { m_lastLineNumber = lastLineNumber; }
95 int lastLineNumber() const { return m_lastLineNumber; }
96 bool prevTerminator() const { return m_terminator; }
97 SourceCode sourceCode(int openBrace, int closeBrace, int firstLine, unsigned startColumn);
98 bool scanRegExp(const Identifier*& pattern, const Identifier*& flags, UChar patternPrefix = 0);
99 bool skipRegExp();
100
101 // Functions for use after parsing.
102 bool sawError() const { return m_error; }
103 String getErrorMessage() const { return m_lexErrorMessage; }
104 void clear();
105 void setOffset(int offset, int lineStartOffset)
106 {
107 m_error = 0;
108 m_lexErrorMessage = String();
109
110 m_code = sourcePtrFromOffset(offset);
111 m_lineStart = sourcePtrFromOffset(lineStartOffset);
112 ASSERT(currentOffset() >= currentLineStartOffset());
113
114 m_buffer8.resize(0);
115 m_buffer16.resize(0);
116 if (LIKELY(m_code < m_codeEnd))
117 m_current = *m_code;
118 else
119 m_current = 0;
120 }
121 void setLineNumber(int line)
122 {
123 m_lineNumber = line;
124 }
125
126 SourceProvider* sourceProvider() const { return m_source->provider(); }
127
128 JSTokenType lexExpectIdentifier(JSTokenData*, JSTokenLocation*, unsigned, bool strictMode);
129
130 private:
131 void record8(int);
132 void append8(const T*, size_t);
133 void record16(int);
134 void record16(T);
135 void append16(const LChar*, size_t);
136 void append16(const UChar* characters, size_t length) { m_buffer16.append(characters, length); }
137
138 ALWAYS_INLINE void shift();
139 ALWAYS_INLINE bool atEnd() const;
140 ALWAYS_INLINE T peek(int offset) const;
141 struct UnicodeHexValue {
142
143 enum ValueType { ValidHex, IncompleteHex, InvalidHex };
144
145 explicit UnicodeHexValue(int value)
146 : m_value(value)
147 {
148 }
149 explicit UnicodeHexValue(ValueType type)
150 : m_value(type == IncompleteHex ? -2 : -1)
151 {
152 }
153
154 ValueType valueType() const
155 {
156 if (m_value >= 0)
157 return ValidHex;
158 return m_value == -2 ? IncompleteHex : InvalidHex;
159 }
160 bool isValid() const { return m_value >= 0; }
161 int value() const
162 {
163 ASSERT(m_value >= 0);
164 return m_value;
165 }
166
167 private:
168 int m_value;
169 };
170 UnicodeHexValue parseFourDigitUnicodeHex();
171 void shiftLineTerminator();
172
173 ALWAYS_INLINE int offsetFromSourcePtr(const T* ptr) const { return ptr - m_codeStart; }
174 ALWAYS_INLINE const T* sourcePtrFromOffset(int offset) const { return m_codeStart + offset; }
175
176 String invalidCharacterMessage() const;
177 ALWAYS_INLINE const T* currentSourcePtr() const;
178 ALWAYS_INLINE void setOffsetFromSourcePtr(const T* sourcePtr, unsigned lineStartOffset) { setOffset(offsetFromSourcePtr(sourcePtr), lineStartOffset); }
179
180 ALWAYS_INLINE void setCodeStart(const StringImpl*);
181
182 ALWAYS_INLINE const Identifier* makeIdentifier(const LChar* characters, size_t length);
183 ALWAYS_INLINE const Identifier* makeIdentifier(const UChar* characters, size_t length);
184 ALWAYS_INLINE const Identifier* makeLCharIdentifier(const LChar* characters, size_t length);
185 ALWAYS_INLINE const Identifier* makeLCharIdentifier(const UChar* characters, size_t length);
186 ALWAYS_INLINE const Identifier* makeRightSizedIdentifier(const UChar* characters, size_t length, UChar orAllChars);
187 ALWAYS_INLINE const Identifier* makeIdentifierLCharFromUChar(const UChar* characters, size_t length);
188
189 ALWAYS_INLINE bool lastTokenWasRestrKeyword() const;
190
191 template <int shiftAmount> void internalShift();
192 template <bool shouldCreateIdentifier> ALWAYS_INLINE JSTokenType parseKeyword(JSTokenData*);
193 template <bool shouldBuildIdentifiers> ALWAYS_INLINE JSTokenType parseIdentifier(JSTokenData*, unsigned lexerFlags, bool strictMode);
194 template <bool shouldBuildIdentifiers> NEVER_INLINE JSTokenType parseIdentifierSlowCase(JSTokenData*, unsigned lexerFlags, bool strictMode);
195 enum StringParseResult {
196 StringParsedSuccessfully,
197 StringUnterminated,
198 StringCannotBeParsed
199 };
200 template <bool shouldBuildStrings> ALWAYS_INLINE StringParseResult parseString(JSTokenData*, bool strictMode);
201 template <bool shouldBuildStrings> NEVER_INLINE StringParseResult parseStringSlowCase(JSTokenData*, bool strictMode);
202 ALWAYS_INLINE void parseHex(double& returnValue);
203 ALWAYS_INLINE bool parseOctal(double& returnValue);
204 ALWAYS_INLINE bool parseDecimal(double& returnValue);
205 ALWAYS_INLINE void parseNumberAfterDecimalPoint();
206 ALWAYS_INLINE bool parseNumberAfterExponentIndicator();
207 ALWAYS_INLINE bool parseMultilineComment();
208
209 static const size_t initialReadBufferCapacity = 32;
210
211 int m_lineNumber;
212 int m_lastLineNumber;
213
214 Vector<LChar> m_buffer8;
215 Vector<UChar> m_buffer16;
216 bool m_terminator;
217 int m_lastToken;
218
219 const SourceCode* m_source;
220 unsigned m_sourceOffset;
221 const T* m_code;
222 const T* m_codeStart;
223 const T* m_codeEnd;
224 const T* m_codeStartPlusOffset;
225 const T* m_lineStart;
226 bool m_isReparsing;
227 bool m_atLineStart;
228 bool m_error;
229 String m_lexErrorMessage;
230
231 T m_current;
232
233 IdentifierArena* m_arena;
234
235 VM* m_vm;
236 };
237
238 template <>
239 ALWAYS_INLINE bool Lexer<LChar>::isWhiteSpace(LChar ch)
240 {
241 return ch == ' ' || ch == '\t' || ch == 0xB || ch == 0xC || ch == 0xA0;
242 }
243
244 template <>
245 ALWAYS_INLINE bool Lexer<UChar>::isWhiteSpace(UChar ch)
246 {
247 return (ch < 256) ? Lexer<LChar>::isWhiteSpace(static_cast<LChar>(ch)) : (WTF::Unicode::isSeparatorSpace(ch) || ch == 0xFEFF);
248 }
249
250 template <>
251 ALWAYS_INLINE bool Lexer<LChar>::isLineTerminator(LChar ch)
252 {
253 return ch == '\r' || ch == '\n';
254 }
255
256 template <>
257 ALWAYS_INLINE bool Lexer<UChar>::isLineTerminator(UChar ch)
258 {
259 return ch == '\r' || ch == '\n' || (ch & ~1) == 0x2028;
260 }
261
262 template <typename T>
263 inline unsigned char Lexer<T>::convertHex(int c1, int c2)
264 {
265 return (toASCIIHexValue(c1) << 4) | toASCIIHexValue(c2);
266 }
267
268 template <typename T>
269 inline UChar Lexer<T>::convertUnicode(int c1, int c2, int c3, int c4)
270 {
271 return (convertHex(c1, c2) << 8) | convertHex(c3, c4);
272 }
273
274 template <typename T>
275 ALWAYS_INLINE const Identifier* Lexer<T>::makeIdentifier(const LChar* characters, size_t length)
276 {
277 return &m_arena->makeIdentifier(m_vm, characters, length);
278 }
279
280 template <typename T>
281 ALWAYS_INLINE const Identifier* Lexer<T>::makeIdentifier(const UChar* characters, size_t length)
282 {
283 return &m_arena->makeIdentifier(m_vm, characters, length);
284 }
285
286 template <>
287 ALWAYS_INLINE const Identifier* Lexer<LChar>::makeRightSizedIdentifier(const UChar* characters, size_t length, UChar)
288 {
289 return &m_arena->makeIdentifierLCharFromUChar(m_vm, characters, length);
290 }
291
292 template <>
293 ALWAYS_INLINE const Identifier* Lexer<UChar>::makeRightSizedIdentifier(const UChar* characters, size_t length, UChar orAllChars)
294 {
295 if (!(orAllChars & ~0xff))
296 return &m_arena->makeIdentifierLCharFromUChar(m_vm, characters, length);
297
298 return &m_arena->makeIdentifier(m_vm, characters, length);
299 }
300
301 template <>
302 ALWAYS_INLINE void Lexer<LChar>::setCodeStart(const StringImpl* sourceString)
303 {
304 ASSERT(sourceString->is8Bit());
305 m_codeStart = sourceString->characters8();
306 }
307
308 template <>
309 ALWAYS_INLINE void Lexer<UChar>::setCodeStart(const StringImpl* sourceString)
310 {
311 ASSERT(!sourceString->is8Bit());
312 m_codeStart = sourceString->characters16();
313 }
314
315 template <typename T>
316 ALWAYS_INLINE const Identifier* Lexer<T>::makeIdentifierLCharFromUChar(const UChar* characters, size_t length)
317 {
318 return &m_arena->makeIdentifierLCharFromUChar(m_vm, characters, length);
319 }
320
321 template <typename T>
322 ALWAYS_INLINE const Identifier* Lexer<T>::makeLCharIdentifier(const LChar* characters, size_t length)
323 {
324 return &m_arena->makeIdentifier(m_vm, characters, length);
325 }
326
327 template <typename T>
328 ALWAYS_INLINE const Identifier* Lexer<T>::makeLCharIdentifier(const UChar* characters, size_t length)
329 {
330 return &m_arena->makeIdentifierLCharFromUChar(m_vm, characters, length);
331 }
332
333 template <typename T>
334 ALWAYS_INLINE JSTokenType Lexer<T>::lexExpectIdentifier(JSTokenData* tokenData, JSTokenLocation* tokenLocation, unsigned lexerFlags, bool strictMode)
335 {
336 ASSERT((lexerFlags & LexerFlagsIgnoreReservedWords));
337 const T* start = m_code;
338 const T* ptr = start;
339 const T* end = m_codeEnd;
340 if (ptr >= end) {
341 ASSERT(ptr == end);
342 goto slowCase;
343 }
344 if (!WTF::isASCIIAlpha(*ptr))
345 goto slowCase;
346 ++ptr;
347 while (ptr < end) {
348 if (!WTF::isASCIIAlphanumeric(*ptr))
349 break;
350 ++ptr;
351 }
352
353 // Here's the shift
354 if (ptr < end) {
355 if ((!WTF::isASCII(*ptr)) || (*ptr == '\\') || (*ptr == '_') || (*ptr == '$'))
356 goto slowCase;
357 m_current = *ptr;
358 } else
359 m_current = 0;
360
361 m_code = ptr;
362 ASSERT(currentOffset() >= currentLineStartOffset());
363
364 // Create the identifier if needed
365 if (lexerFlags & LexexFlagsDontBuildKeywords)
366 tokenData->ident = 0;
367 else
368 tokenData->ident = makeLCharIdentifier(start, ptr - start);
369 tokenLocation->line = m_lineNumber;
370 tokenLocation->lineStartOffset = currentLineStartOffset();
371 tokenLocation->startOffset = offsetFromSourcePtr(start);
372 tokenLocation->endOffset = currentOffset();
373 ASSERT(tokenLocation->startOffset >= tokenLocation->lineStartOffset);
374 m_lastToken = IDENT;
375 return IDENT;
376
377 slowCase:
378 return lex(tokenData, tokenLocation, lexerFlags, strictMode);
379 }
380
381 } // namespace JSC
382
383 #endif // Lexer_h