[apple/javascriptcore.git] / parser / Lexer.h

/*
 *  Copyright (C) 1999-2000 Harri Porten (porten@kde.org)
 *  Copyright (C) 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2011, 2012 Apple Inc. All rights reserved.
 *  Copyright (C) 2010 Zoltan Herczeg (zherczeg@inf.u-szeged.hu)
 *
 *  This library is free software; you can redistribute it and/or
 *  modify it under the terms of the GNU Library General Public
 *  License as published by the Free Software Foundation; either
 *  version 2 of the License, or (at your option) any later version.
 *
 *  This library is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 *  Library General Public License for more details.
 *
 *  You should have received a copy of the GNU Library General Public License
 *  along with this library; see the file COPYING.LIB.  If not, write to
 *  the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
 *  Boston, MA 02110-1301, USA.
 *
 */

#ifndef Lexer_h
#define Lexer_h

#include "Lookup.h"
#include "ParserArena.h"
#include "ParserTokens.h"
#include "SourceCode.h"
#include <wtf/ASCIICType.h>
#include <wtf/AlwaysInline.h>
#include <wtf/SegmentedVector.h>
#include <wtf/Vector.h>
#include <wtf/unicode/Unicode.h>

namespace JSC {

class Keywords {
public:
    bool isKeyword(const Identifier& ident) const
    {
        return m_keywordTable.entry(m_globalData, ident);
    }
    
    const HashEntry* getKeyword(const Identifier& ident) const
    {
        return m_keywordTable.entry(m_globalData, ident);
    }
    
    ~Keywords()
    {
        m_keywordTable.deleteTable();
    }
    
private:
    friend class JSGlobalData;
    
    Keywords(JSGlobalData*);
    
    JSGlobalData* m_globalData;
    const HashTable m_keywordTable;
};

// Bit flags combined into the unsigned flags argument of Lexer::lex() and
// Lexer::lexExpectIdentifier(). Each enumerator occupies its own bit.
enum LexerFlags {
    LexerFlagsIgnoreReservedWords = 1,
    LexerFlagsDontBuildStrings = 2,
    // When set, lexExpectIdentifier() stores a null identifier instead of building one.
    LexerFlagsDontBuildKeywords = 4,
    // Misspelled original name ("Lexex"); kept as an alias so existing users keep compiling.
    LexexFlagsDontBuildKeywords = LexerFlagsDontBuildKeywords
};

// Tokenizer over a buffer of code units of type T. Member specializations for
// LChar and UChar follow the class definition.
template <typename T>
class Lexer {
    WTF_MAKE_NONCOPYABLE(Lexer);
    WTF_MAKE_FAST_ALLOCATED;

public:
    Lexer(JSGlobalData*);
    ~Lexer();

    // Character classification and hex/unicode escape conversion helpers.
    static bool isWhiteSpace(T character);
    static bool isLineTerminator(T character);
    static unsigned char convertHex(int c1, int c2);
    static UChar convertUnicode(int c1, int c2, int c3, int c4);

    // Set-up performed before lexing starts.
    void setCode(const SourceCode&, ParserArena*);
    void setIsReparsing() { m_isReparsing = true; }
    bool isReparsing() const { return m_isReparsing; }

    // Tokenization entry points and line/position accessors.
    JSTokenType lex(JSTokenData*, JSTokenInfo*, unsigned, bool strictMode);
    bool nextTokenIsColon();
    int lineNumber() const { return m_lineNumber; }
    void setLastLineNumber(int lastLineNumber) { m_lastLineNumber = lastLineNumber; }
    int lastLineNumber() const { return m_lastLineNumber; }
    bool prevTerminator() const { return m_terminator; }
    SourceCode sourceCode(int openBrace, int closeBrace, int firstLine);
    bool scanRegExp(const Identifier*& pattern, const Identifier*& flags, UChar patternPrefix = 0);
    bool skipRegExp();

    // Error reporting and state reset, for use after parsing.
    bool sawError() const { return m_error; }
    UString getErrorMessage() const { return m_lexErrorMessage; }
    void clear();

    // Repositions the lexer |offset| code units past m_codeStart. Any recorded error
    // and both scratch buffers are discarded, and m_current is reloaded from the new
    // position (or set to 0 when that position is not before m_codeEnd).
    void setOffset(int offset)
    {
        m_error = false;
        m_lexErrorMessage = UString();
        m_buffer8.resize(0);
        m_buffer16.resize(0);
        m_code = m_codeStart + offset;
        m_current = LIKELY(m_code < m_codeEnd) ? *m_code : static_cast<T>(0);
    }

    void setLineNumber(int line) { m_lineNumber = line; }

    SourceProvider* sourceProvider() const { return m_source->provider(); }

    // Fast path for callers that expect a plain identifier; defers to lex() otherwise.
    JSTokenType lexExpectIdentifier(JSTokenData*, JSTokenInfo*, unsigned, bool strictMode);

private:
    // Scratch-buffer helpers (m_buffer8 / m_buffer16).
    void record8(int);
    void append8(const T*, size_t);
    void record16(int);
    void record16(T);
    void append16(const LChar*, size_t);
    void append16(const UChar* characters, size_t length)
    {
        m_buffer16.append(characters, length);
    }

    // Cursor movement and look-ahead.
    ALWAYS_INLINE void shift();
    ALWAYS_INLINE bool atEnd() const;
    ALWAYS_INLINE T peek(int offset) const;
    int parseFourDigitUnicodeHex();
    void shiftLineTerminator();

    UString invalidCharacterMessage() const;
    ALWAYS_INLINE const T* currentCharacter() const;

    // Distance of the read pointer from the start of the code, in code units.
    ALWAYS_INLINE int currentOffset() const
    {
        return static_cast<int>(m_code - m_codeStart);
    }

    // Same as setOffset(), but takes a pointer into the code instead of an index.
    ALWAYS_INLINE void setOffsetFromCharOffset(const T* charOffset)
    {
        setOffset(static_cast<int>(charOffset - m_codeStart));
    }

    ALWAYS_INLINE void setCodeStart(const StringImpl*);

    // Identifier creation; each forwards to m_arena.
    ALWAYS_INLINE const Identifier* makeIdentifier(const LChar* characters, size_t length);
    ALWAYS_INLINE const Identifier* makeIdentifier(const UChar* characters, size_t length);
    ALWAYS_INLINE const Identifier* makeIdentifierLCharFromUChar(const UChar* characters, size_t length);

    ALWAYS_INLINE bool lastTokenWasRestrKeyword() const;

    // Per-token-kind parsing routines.
    template <int shiftAmount> void internalShift();
    template <bool shouldCreateIdentifier> ALWAYS_INLINE JSTokenType parseKeyword(JSTokenData*);
    template <bool shouldBuildIdentifiers> ALWAYS_INLINE JSTokenType parseIdentifier(JSTokenData*, unsigned lexerFlags, bool strictMode);
    template <bool shouldBuildIdentifiers> NEVER_INLINE JSTokenType parseIdentifierSlowCase(JSTokenData*, unsigned lexerFlags, bool strictMode);
    template <bool shouldBuildStrings> ALWAYS_INLINE bool parseString(JSTokenData*, bool strictMode);
    template <bool shouldBuildStrings> NEVER_INLINE bool parseStringSlowCase(JSTokenData*, bool strictMode);
    ALWAYS_INLINE void parseHex(double& returnValue);
    ALWAYS_INLINE bool parseOctal(double& returnValue);
    ALWAYS_INLINE bool parseDecimal(double& returnValue);
    ALWAYS_INLINE void parseNumberAfterDecimalPoint();
    ALWAYS_INLINE bool parseNumberAfterExponentIndicator();
    ALWAYS_INLINE bool parseMultilineComment();

    static const size_t initialReadBufferCapacity = 32;

    int m_lineNumber;
    int m_lastLineNumber;

    // Scratch buffers for 8-bit and 16-bit characters; emptied by setOffset().
    Vector<LChar> m_buffer8;
    Vector<UChar> m_buffer16;
    bool m_terminator;
    int m_lastToken;

    const SourceCode* m_source;
    const T* m_code; // Current read position.
    const T* m_codeStart; // Base pointer that offsets are measured from.
    const T* m_codeEnd; // Reads are only performed while m_code < m_codeEnd.
    bool m_isReparsing;
    bool m_atLineStart;
    bool m_error;
    UString m_lexErrorMessage;

    // Code unit at m_code, or 0 when m_code is not before m_codeEnd.
    T m_current;

    IdentifierArena* m_arena;

    JSGlobalData* m_globalData;
};

// 8-bit white space test: space, tab, vertical tab (0x0B), form feed (0x0C)
// and the Latin-1 no-break space (0xA0).
template <>
ALWAYS_INLINE bool Lexer<LChar>::isWhiteSpace(LChar ch)
{
    switch (ch) {
    case ' ':
    case '\t':
    case 0xB:
    case 0xC:
    case 0xA0:
        return true;
    default:
        return false;
    }
}

// 16-bit white space test. Code units below 256 reuse the 8-bit rule; anything
// else counts when it is a Unicode separator space or U+FEFF.
template <>
ALWAYS_INLINE bool Lexer<UChar>::isWhiteSpace(UChar ch)
{
    if (ch < 256)
        return Lexer<LChar>::isWhiteSpace(static_cast<LChar>(ch));

    return WTF::Unicode::isSeparatorSpace(ch) || ch == 0xFEFF;
}

// 8-bit line terminator test: carriage return or line feed.
template <>
ALWAYS_INLINE bool Lexer<LChar>::isLineTerminator(LChar ch)
{
    switch (ch) {
    case '\r':
    case '\n':
        return true;
    default:
        return false;
    }
}

// 16-bit line terminator test: carriage return, line feed, U+2028 or U+2029.
template <>
ALWAYS_INLINE bool Lexer<UChar>::isLineTerminator(UChar ch)
{
    if (ch == '\r' || ch == '\n')
        return true;

    // U+2028 and U+2029 differ only in their lowest bit.
    return ch == 0x2028 || ch == 0x2029;
}

// Combines two hex digit characters into one byte: c1 supplies the high nibble
// and c2 the low nibble.
template <typename T>
inline unsigned char Lexer<T>::convertHex(int c1, int c2)
{
    const int highNibble = toASCIIHexValue(c1);
    const int lowNibble = toASCIIHexValue(c2);
    return static_cast<unsigned char>((highNibble << 4) | lowNibble);
}

// Combines four hex digit characters into one UChar: c1/c2 form the high byte
// and c3/c4 the low byte.
template <typename T>
inline UChar Lexer<T>::convertUnicode(int c1, int c2, int c3, int c4)
{
    const unsigned char highByte = convertHex(c1, c2);
    const unsigned char lowByte = convertHex(c3, c4);
    return static_cast<UChar>((highByte << 8) | lowByte);
}

// Asks the identifier arena for an Identifier built from |length| 8-bit characters
// and returns its address.
template <typename T>
ALWAYS_INLINE const Identifier* Lexer<T>::makeIdentifier(const LChar* characters, size_t length)
{
    const Identifier& identifier = m_arena->makeIdentifier(m_globalData, characters, length);
    return &identifier;
}

// Asks the identifier arena for an Identifier built from |length| 16-bit characters
// and returns its address.
template <typename T>
ALWAYS_INLINE const Identifier* Lexer<T>::makeIdentifier(const UChar* characters, size_t length)
{
    const Identifier& identifier = m_arena->makeIdentifier(m_globalData, characters, length);
    return &identifier;
}

// Points m_codeStart at the 8-bit character data of |sourceString|, which is
// asserted to be an 8-bit string.
template <>
ALWAYS_INLINE void Lexer<LChar>::setCodeStart(const StringImpl* sourceString)
{
    ASSERT(sourceString->is8Bit());
    const LChar* const characters = sourceString->characters8();
    m_codeStart = characters;
}

// Points m_codeStart at the 16-bit character data of |sourceString|, which is
// asserted not to be an 8-bit string.
template <>
ALWAYS_INLINE void Lexer<UChar>::setCodeStart(const StringImpl* sourceString)
{
    ASSERT(!sourceString->is8Bit());
    const UChar* const characters = sourceString->characters16();
    m_codeStart = characters;
}

// Forwards to IdentifierArena::makeIdentifierLCharFromUChar() with |length| 16-bit
// characters and returns the address of the resulting Identifier.
template <typename T>
ALWAYS_INLINE const Identifier* Lexer<T>::makeIdentifierLCharFromUChar(const UChar* characters, size_t length)
{
    const Identifier& identifier = m_arena->makeIdentifierLCharFromUChar(m_globalData, characters, length);
    return &identifier;
}

// Fast path for lexing a token that is expected to be an identifier.
//
// Handles only the simple shape: an ASCII letter followed by ASCII letters/digits,
// terminated by the end of input or by an ASCII character other than '\\', '_' or
// '$'. Every other input is forwarded unchanged to lex(), with no member state
// modified beforehand.
//
// On the fast path the read position moves past the identifier, the token's line
// and offsets are filled in, and IDENT is returned. tokenData->ident is left null
// when LexexFlagsDontBuildKeywords is set in |lexerFlags|.
template <typename T>
ALWAYS_INLINE JSTokenType Lexer<T>::lexExpectIdentifier(JSTokenData* tokenData, JSTokenInfo* tokenInfo, unsigned lexerFlags, bool strictMode)
{
    ASSERT((lexerFlags & LexerFlagsIgnoreReservedWords));
    const T* const identifierStart = m_code;
    const T* const sourceEnd = m_codeEnd;
    const T* cursor = identifierStart;

    // Nothing left to read: let the general lexer deal with it.
    if (cursor >= sourceEnd) {
        ASSERT(cursor == sourceEnd);
        return lex(tokenData, tokenInfo, lexerFlags, strictMode);
    }

    // The first character must be an ASCII letter for the fast path to apply.
    if (!WTF::isASCIIAlpha(*cursor))
        return lex(tokenData, tokenInfo, lexerFlags, strictMode);

    // Consume the run of ASCII letters and digits that follows.
    for (++cursor; cursor < sourceEnd && WTF::isASCIIAlphanumeric(*cursor); ++cursor) { }

    // Work out the character after the identifier (0 at end of input). If it could
    // still belong to an identifier, bail out before touching any member state.
    T following = 0;
    if (cursor < sourceEnd) {
        following = *cursor;
        if (!WTF::isASCII(following) || following == '\\' || following == '_' || following == '$')
            return lex(tokenData, tokenInfo, lexerFlags, strictMode);
    }

    // Commit the shift.
    m_current = following;
    m_code = cursor;

    // Create the identifier only when the caller wants it.
    tokenData->ident = (lexerFlags & LexexFlagsDontBuildKeywords) ? 0 : makeIdentifier(identifierStart, cursor - identifierStart);

    tokenInfo->line = m_lineNumber;
    tokenInfo->startOffset = identifierStart - m_codeStart;
    tokenInfo->endOffset = currentOffset();
    m_lastToken = IDENT;
    return IDENT;
}

} // namespace JSC

#endif // Lexer_h
Commit	Line	Data
9dae56ea A	1	/*
9dae56ea A	2	* Copyright (C) 1999-2000 Harri Porten (porten@kde.org)
6fe7ccc8	3	* Copyright (C) 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2011, 2012 Apple Inc. All rights reserved.
14957cd0	4	* Copyright (C) 2010 Zoltan Herczeg (zherczeg@inf.u-szeged.hu)
9dae56ea A	5	*
	6	* This library is free software; you can redistribute it and/or
	7	* modify it under the terms of the GNU Library General Public
	8	* License as published by the Free Software Foundation; either
	9	* version 2 of the License, or (at your option) any later version.
	10	*
	11	* This library is distributed in the hope that it will be useful,
	12	* but WITHOUT ANY WARRANTY; without even the implied warranty of
	13	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
	14	* Library General Public License for more details.
	15	*
	16	* You should have received a copy of the GNU Library General Public License
	17	* along with this library; see the file COPYING.LIB. If not, write to
	18	* the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
	19	* Boston, MA 02110-1301, USA.
	20	*
	21	*/
	22
	23	#ifndef Lexer_h
	24	#define Lexer_h
	25
9dae56ea	26	#include "Lookup.h"
f9bf01c6	27	#include "ParserArena.h"
6fe7ccc8	28	#include "ParserTokens.h"
9dae56ea	29	#include "SourceCode.h"
ba379fdc	30	#include <wtf/ASCIICType.h>
14957cd0	31	#include <wtf/AlwaysInline.h>
ba379fdc	32	#include <wtf/SegmentedVector.h>
9dae56ea	33	#include <wtf/Vector.h>
ba379fdc	34	#include <wtf/unicode/Unicode.h>
9dae56ea A	35
	36	namespace JSC {
	37
6fe7ccc8 A	38	class Keywords {
	39	public:
	40	bool isKeyword(const Identifier& ident) const
ba379fdc	41	{
6fe7ccc8	42	return m_keywordTable.entry(m_globalData, ident);
ba379fdc	43	}
6fe7ccc8 A	44
6fe7ccc8 A	45	const HashEntry* getKeyword(const Identifier& ident) const
ba379fdc	46	{
6fe7ccc8	47	return m_keywordTable.entry(m_globalData, ident);
ba379fdc	48	}
6fe7ccc8 A	49
6fe7ccc8 A	50	~Keywords()
ba379fdc	51	{
6fe7ccc8	52	m_keywordTable.deleteTable();
ba379fdc	53	}
6fe7ccc8 A	54
	55	private:
	56	friend class JSGlobalData;
	57
	58	Keywords(JSGlobalData*);
	59
	60	JSGlobalData* m_globalData;
	61	const HashTable m_keywordTable;
	62	};
	63
	64	enum LexerFlags {
	65	LexerFlagsIgnoreReservedWords = 1,
	66	LexerFlagsDontBuildStrings = 2,
	67	LexexFlagsDontBuildKeywords = 4
	68	};
	69
	70	template <typename T>
	71	class Lexer {
	72	WTF_MAKE_NONCOPYABLE(Lexer);
	73	WTF_MAKE_FAST_ALLOCATED;
	74
	75	public:
	76	Lexer(JSGlobalData*);
	77	~Lexer();
	78
	79	// Character manipulation functions.
	80	static bool isWhiteSpace(T character);
	81	static bool isLineTerminator(T character);
	82	static unsigned char convertHex(int c1, int c2);
	83	static UChar convertUnicode(int c1, int c2, int c3, int c4);
	84
	85	// Functions to set up parsing.
	86	void setCode(const SourceCode&, ParserArena*);
	87	void setIsReparsing() { m_isReparsing = true; }
	88	bool isReparsing() const { return m_isReparsing; }
	89
	90	JSTokenType lex(JSTokenData*, JSTokenInfo*, unsigned, bool strictMode);
	91	bool nextTokenIsColon();
	92	int lineNumber() const { return m_lineNumber; }
	93	void setLastLineNumber(int lastLineNumber) { m_lastLineNumber = lastLineNumber; }
	94	int lastLineNumber() const { return m_lastLineNumber; }
	95	bool prevTerminator() const { return m_terminator; }
	96	SourceCode sourceCode(int openBrace, int closeBrace, int firstLine);
	97	bool scanRegExp(const Identifier*& pattern, const Identifier*& flags, UChar patternPrefix = 0);
	98	bool skipRegExp();
	99
	100	// Functions for use after parsing.
	101	bool sawError() const { return m_error; }
	102	UString getErrorMessage() const { return m_lexErrorMessage; }
	103	void clear();
	104	void setOffset(int offset)
ba379fdc	105	{
6fe7ccc8 A	106	m_error = 0;
	107	m_lexErrorMessage = UString();
	108	m_code = m_codeStart + offset;
	109	m_buffer8.resize(0);
	110	m_buffer16.resize(0);
	111	if (LIKELY(m_code < m_codeEnd))
	112	m_current = *m_code;
	113	else
	114	m_current = 0;
ba379fdc	115	}
6fe7ccc8	116	void setLineNumber(int line)
14957cd0	117	{
6fe7ccc8	118	m_lineNumber = line;
14957cd0	119	}
ba379fdc	120
6fe7ccc8 A	121	SourceProvider* sourceProvider() const { return m_source->provider(); }
	122
	123	JSTokenType lexExpectIdentifier(JSTokenData*, JSTokenInfo*, unsigned, bool strictMode);
	124
	125	private:
	126	void record8(int);
	127	void append8(const T*, size_t);
	128	void record16(int);
	129	void record16(T);
	130	void append16(const LChar*, size_t);
	131	void append16(const UChar* characters, size_t length) { m_buffer16.append(characters, length); }
	132
	133	ALWAYS_INLINE void shift();
	134	ALWAYS_INLINE bool atEnd() const;
	135	ALWAYS_INLINE T peek(int offset) const;
	136	int parseFourDigitUnicodeHex();
	137	void shiftLineTerminator();
	138
	139	UString invalidCharacterMessage() const;
	140	ALWAYS_INLINE const T* currentCharacter() const;
	141	ALWAYS_INLINE int currentOffset() const { return m_code - m_codeStart; }
	142	ALWAYS_INLINE void setOffsetFromCharOffset(const T* charOffset) { setOffset(charOffset - m_codeStart); }
	143
	144	ALWAYS_INLINE void setCodeStart(const StringImpl*);
	145
	146	ALWAYS_INLINE const Identifier* makeIdentifier(const LChar* characters, size_t length);
	147	ALWAYS_INLINE const Identifier* makeIdentifier(const UChar* characters, size_t length);
	148	ALWAYS_INLINE const Identifier* makeIdentifierLCharFromUChar(const UChar* characters, size_t length);
	149
	150	ALWAYS_INLINE bool lastTokenWasRestrKeyword() const;
	151
	152	template <int shiftAmount> void internalShift();
	153	template <bool shouldCreateIdentifier> ALWAYS_INLINE JSTokenType parseKeyword(JSTokenData*);
	154	template <bool shouldBuildIdentifiers> ALWAYS_INLINE JSTokenType parseIdentifier(JSTokenData*, unsigned lexerFlags, bool strictMode);
	155	template <bool shouldBuildIdentifiers> NEVER_INLINE JSTokenType parseIdentifierSlowCase(JSTokenData*, unsigned lexerFlags, bool strictMode);
	156	template <bool shouldBuildStrings> ALWAYS_INLINE bool parseString(JSTokenData*, bool strictMode);
	157	template <bool shouldBuildStrings> NEVER_INLINE bool parseStringSlowCase(JSTokenData*, bool strictMode);
	158	ALWAYS_INLINE void parseHex(double& returnValue);
	159	ALWAYS_INLINE bool parseOctal(double& returnValue);
	160	ALWAYS_INLINE bool parseDecimal(double& returnValue);
	161	ALWAYS_INLINE void parseNumberAfterDecimalPoint();
	162	ALWAYS_INLINE bool parseNumberAfterExponentIndicator();
	163	ALWAYS_INLINE bool parseMultilineComment();
	164
	165	static const size_t initialReadBufferCapacity = 32;
	166
	167	int m_lineNumber;
	168	int m_lastLineNumber;
	169
	170	Vector<LChar> m_buffer8;
	171	Vector<UChar> m_buffer16;
	172	bool m_terminator;
	173	int m_lastToken;
	174
	175	const SourceCode* m_source;
	176	const T* m_code;
	177	const T* m_codeStart;
	178	const T* m_codeEnd;
	179	bool m_isReparsing;
	180	bool m_atLineStart;
	181	bool m_error;
	182	UString m_lexErrorMessage;
	183
	184	T m_current;
185
186	IdentifierArena* m_arena;
187
188	JSGlobalData* m_globalData;
189	};
190
191	template <>
192	ALWAYS_INLINE bool Lexer<LChar>::isWhiteSpace(LChar ch)
193	{
194	return ch == ' ' || ch == '\t' || ch == 0xB || ch == 0xC || ch == 0xA0;
195	}
196
197	template <>
198	ALWAYS_INLINE bool Lexer<UChar>::isWhiteSpace(UChar ch)
199	{
200	return (ch < 256) ? Lexer<LChar>::isWhiteSpace(static_cast<LChar>(ch)) : (WTF::Unicode::isSeparatorSpace(ch) || ch == 0xFEFF);
201	}
202
203	template <>
204	ALWAYS_INLINE bool Lexer<LChar>::isLineTerminator(LChar ch)
205	{
206	return ch == '\r' || ch == '\n';
207	}
208
209	template <>
210	ALWAYS_INLINE bool Lexer<UChar>::isLineTerminator(UChar ch)
211	{
212	return ch == '\r' || ch == '\n' || (ch & ~1) == 0x2028;
213	}
214
215	template <typename T>
216	inline unsigned char Lexer<T>::convertHex(int c1, int c2)
217	{
218	return (toASCIIHexValue(c1) << 4) | toASCIIHexValue(c2);
219	}
220
221	template <typename T>
222	inline UChar Lexer<T>::convertUnicode(int c1, int c2, int c3, int c4)
223	{
224	return (convertHex(c1, c2) << 8) | convertHex(c3, c4);
225	}
226
227	template <typename T>
228	ALWAYS_INLINE const Identifier* Lexer<T>::makeIdentifier(const LChar* characters, size_t length)
229	{
230	return &m_arena->makeIdentifier(m_globalData, characters, length);
231	}
232
233	template <typename T>
234	ALWAYS_INLINE const Identifier* Lexer<T>::makeIdentifier(const UChar* characters, size_t length)
235	{
236	return &m_arena->makeIdentifier(m_globalData, characters, length);
237	}
238
239	template <>
240	ALWAYS_INLINE void Lexer<LChar>::setCodeStart(const StringImpl* sourceString)
241	{
242	ASSERT(sourceString->is8Bit());
243	m_codeStart = sourceString->characters8();
244	}
245
246	template <>
247	ALWAYS_INLINE void Lexer<UChar>::setCodeStart(const StringImpl* sourceString)
248	{
249	ASSERT(!sourceString->is8Bit());
250	m_codeStart = sourceString->characters16();
251	}
252
253	template <typename T>
254	ALWAYS_INLINE const Identifier* Lexer<T>::makeIdentifierLCharFromUChar(const UChar* characters, size_t length)
255	{
256	return &m_arena->makeIdentifierLCharFromUChar(m_globalData, characters, length);
257	}
258
259	template <typename T>
260	ALWAYS_INLINE JSTokenType Lexer<T>::lexExpectIdentifier(JSTokenData* tokenData, JSTokenInfo* tokenInfo, unsigned lexerFlags, bool strictMode)
261	{
262	ASSERT((lexerFlags & LexerFlagsIgnoreReservedWords));
263	const T* start = m_code;
264	const T* ptr = start;
265	const T* end = m_codeEnd;
266	if (ptr >= end) {
267	ASSERT(ptr == end);
268	goto slowCase;
269	}
270	if (!WTF::isASCIIAlpha(*ptr))
271	goto slowCase;
272	++ptr;
273	while (ptr < end) {
274	if (!WTF::isASCIIAlphanumeric(*ptr))
275	break;
14957cd0	276	++ptr;
f9bf01c6 A	277	}
f9bf01c6 A	278
6fe7ccc8 A	279	// Here's the shift
	280	if (ptr < end) {
	281	if ((!WTF::isASCII(*ptr)) || (*ptr == '\\') || (*ptr == '_') || (*ptr == '$'))
	282	goto slowCase;
	283	m_current = *ptr;
	284	} else
	285	m_current = 0;
	286
	287	m_code = ptr;
	288
	289	// Create the identifier if needed
	290	if (lexerFlags & LexexFlagsDontBuildKeywords)
	291	tokenData->ident = 0;
	292	else
	293	tokenData->ident = makeIdentifier(start, ptr - start);
	294	tokenInfo->line = m_lineNumber;
	295	tokenInfo->startOffset = start - m_codeStart;
	296	tokenInfo->endOffset = currentOffset();
	297	m_lastToken = IDENT;
	298	return IDENT;
	299
	300	slowCase:
	301	return lex(tokenData, tokenInfo, lexerFlags, strictMode);
	302	}
	303
9dae56ea A	304	} // namespace JSC
	305
	306	#endif // Lexer_h