2 Copyright (C) 1997 Martin Jones (mjones@kde.org)
3 (C) 1997 Torben Weis (weis@kde.org)
4 (C) 1998 Waldo Bastian (bastian@kde.org)
5 (C) 2001 Dirk Mueller (mueller@kde.org)
6 Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008, 2009 Apple Inc. All rights reserved.
8 This library is free software; you can redistribute it and/or
9 modify it under the terms of the GNU Library General Public
10 License as published by the Free Software Foundation; either
11 version 2 of the License, or (at your option) any later version.
13 This library is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Library General Public License for more details.
18 You should have received a copy of the GNU Library General Public License
19 along with this library; see the file COPYING.LIB. If not, write to
20 the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
21 Boston, MA 02110-1301, USA.
24 #ifndef HTMLTokenizer_h
25 #define HTMLTokenizer_h
27 #include "CachedResourceClient.h"
28 #include "CachedResourceHandle.h"
29 #include "NamedMappedAttrMap.h"
30 #include "SegmentedString.h"
32 #include "Tokenizer.h"
33 #include <wtf/Deque.h>
34 #include <wtf/OwnPtr.h>
35 #include <wtf/Vector.h>
40 class DocumentFragment
;
43 class HTMLScriptElement
;
44 class HTMLViewSourceDocument
;
49 class ScriptSourceCode
;
51 extern const double defaultTokenizerTimeDelay
;
55 * represents one HTML tag. Consists of a numerical id, and the list
56 * of attributes. Can also represent text. In this case the id = 0 and
57 * text contains the text.
62 , selfClosingTag(false)
63 , brokenXMLStyle(false)
68 void addAttribute(AtomicString
& attrName
, const AtomicString
& v
, bool viewSourceMode
);
70 bool isOpenTag(const QualifiedName
& fullName
) const { return beginTag
&& fullName
.localName() == tagName
; }
71 bool isCloseTag(const QualifiedName
& fullName
) const { return !beginTag
&& fullName
.localName() == tagName
; }
79 selfClosingTag
= false;
80 brokenXMLStyle
= false;
82 m_sourceInfo
->clear();
85 void addViewSourceChar(UChar c
) { if (!m_sourceInfo
.get()) m_sourceInfo
.set(new Vector
<UChar
>); m_sourceInfo
->append(c
); }
// Attributes of the token (the class comment describes a token as an id plus
// a list of attributes).
87 RefPtr
<NamedMappedAttrMap
> attrs
;
// Text payload; per the class comment, a text token has id = 0 and keeps its
// text here.
88 RefPtr
<StringImpl
> text
;
// Character buffer filled by addViewSourceChar(), which allocates it on first
// use and appends one UChar per call.
93 OwnPtr
<Vector
<UChar
> > m_sourceInfo
;
101 DoctypeBeforePublicID
,
103 DoctypeAfterPublicID
,
104 DoctypeBeforeSystemID
,
106 DoctypeAfterSystemID
,
119 m_state
= DoctypeBegin
;
123 DoctypeState
state() { return m_state
; }
124 void setState(DoctypeState s
) { m_state
= s
; }
// Character buffers for the parts of the doctype.
// NOTE(review): presumably the doctype name, public identifier and system
// identifier respectively -- confirm against the parser code that fills them.
126 Vector
<UChar
> m_name
;
127 Vector
<UChar
> m_publicID
;
128 Vector
<UChar
> m_systemID
;
// Parsing state; read by state() and written by setState().
129 DoctypeState m_state
;
// NOTE(review): presumably the raw source characters of the doctype -- confirm.
131 Vector
<UChar
> m_source
;
134 //-----------------------------------------------------------------------------
136 class HTMLTokenizer
: public Tokenizer
, public CachedResourceClient
{
// Three constructors, one per target type: HTMLDocument (with an explicit
// reportErrors flag), HTMLViewSourceDocument and DocumentFragment.
// Definitions are not part of this header.
// NOTE(review): the two single-argument constructors are not marked explicit,
// so they also act as implicit conversions from the pointer types.
138 HTMLTokenizer(HTMLDocument
*, bool reportErrors
);
139 HTMLTokenizer(HTMLViewSourceDocument
*);
140 HTMLTokenizer(DocumentFragment
*);
// Virtual destructor; the class is used polymorphically (it derives from
// Tokenizer and CachedResourceClient).
141 virtual ~HTMLTokenizer();
// Virtual entry points declared here and defined outside this header.
// NOTE(review): presumably overrides of the Tokenizer base class interface --
// confirm against Tokenizer.h.
143 virtual bool write(const SegmentedString
&, bool appendData
);
144 virtual void finish();
145 virtual void setForceSynchronous(bool force
);
146 virtual bool isWaitingForScripts() const;
147 virtual void stopParsing();
148 virtual bool processingData() const;
149 virtual int executingScript() const { return m_executingScript
; }
150 virtual void parsePending();
152 virtual int lineNumber() const { return m_lineNumber
; }
153 virtual int columnNumber() const { return 1; }
155 bool processingContentWrittenByScript() const { return m_src
.excludeLineNumbers(); }
157 virtual void executeScriptsWaitingForStylesheets();
159 virtual bool isHTMLTokenizer() const { return true; }
160 HTMLParser
* htmlParser() const { return m_parser
.get(); }
165 // Where we are in parsing a tag
171 PassRefPtr
<Node
> processToken();
172 void processDoctypeToken();
174 State
processListing(SegmentedString
, State
);
175 State
parseComment(SegmentedString
&, State
);
176 State
parseDoctype(SegmentedString
&, State
);
177 State
parseServer(SegmentedString
&, State
);
178 State
parseText(SegmentedString
&, State
);
179 State
parseSpecial(SegmentedString
&, State
);
180 State
parseTag(SegmentedString
&, State
);
181 State
parseEntity(SegmentedString
&, UChar
*& dest
, State
, unsigned& cBufferPos
, bool start
, bool parsingTag
);
182 State
parseProcessingInstruction(SegmentedString
&, State
);
183 State
scriptHandler(State
);
184 State
scriptExecution(const ScriptSourceCode
&, State
);
185 void setSrc(const SegmentedString
&);
187 // check if we have enough space in the buffer.
189 inline void checkBuffer(int len
= 10)
191 if ((m_dest
- m_buffer
) > m_bufferSize
- len
)
195 inline void checkScriptBuffer(int len
= 10)
197 if (m_scriptCodeSize
+ len
>= m_scriptCodeCapacity
)
198 enlargeScriptBuffer(len
);
201 void enlargeBuffer(int len
);
202 void enlargeScriptBuffer(int len
);
204 bool continueProcessing(int& processedCount
, double startTime
, State
&);
205 void timerFired(Timer
<HTMLTokenizer
>*);
206 void allDataProcessed();
208 // from CachedResourceClient
209 void notifyFinished(CachedResource
*);
217 Token m_currentToken
;
221 // are we in quotes within a html tag
222 enum { NoQuote
, SingleQuote
, DoubleQuote
} tquote
;
224 // Are we in a &... character entity description?
234 unsigned EntityUnicodeValue
;
250 State() : m_bits(0) { }
252 TagState
tagState() const { return static_cast<TagState
>(m_bits
& TagMask
); }
253 void setTagState(TagState t
) { m_bits
= (m_bits
& ~TagMask
) | t
; }
254 EntityState
entityState() const { return static_cast<EntityState
>((m_bits
& EntityMask
) >> EntityShift
); }
255 void setEntityState(EntityState e
) { m_bits
= (m_bits
& ~EntityMask
) | (e
<< EntityShift
); }
257 bool inScript() const { return testBit(InScript
); }
258 void setInScript(bool v
) { setBit(InScript
, v
); }
259 bool inStyle() const { return testBit(InStyle
); }
260 void setInStyle(bool v
) { setBit(InStyle
, v
); }
261 bool inXmp() const { return testBit(InXmp
); }
262 void setInXmp(bool v
) { setBit(InXmp
, v
); }
263 bool inTitle() const { return testBit(InTitle
); }
264 void setInTitle(bool v
) { setBit(InTitle
, v
); }
265 bool inIFrame() const { return testBit(InIFrame
); }
266 void setInIFrame(bool v
) { setBit(InIFrame
, v
); }
267 bool inPlainText() const { return testBit(InPlainText
); }
268 void setInPlainText(bool v
) { setBit(InPlainText
, v
); }
269 bool inProcessingInstruction() const { return testBit(InProcessingInstruction
); }
270 void setInProcessingInstruction(bool v
) { return setBit(InProcessingInstruction
, v
); }
271 bool inComment() const { return testBit(InComment
); }
272 void setInComment(bool v
) { setBit(InComment
, v
); }
273 bool inDoctype() const { return testBit(InDoctype
); }
274 void setInDoctype(bool v
) { setBit(InDoctype
, v
); }
275 bool inTextArea() const { return testBit(InTextArea
); }
276 void setInTextArea(bool v
) { setBit(InTextArea
, v
); }
277 bool escaped() const { return testBit(Escaped
); }
278 void setEscaped(bool v
) { setBit(Escaped
, v
); }
279 bool inServer() const { return testBit(InServer
); }
280 void setInServer(bool v
) { setBit(InServer
, v
); }
281 bool skipLF() const { return testBit(SkipLF
); }
282 void setSkipLF(bool v
) { setBit(SkipLF
, v
); }
283 bool startTag() const { return testBit(StartTag
); }
284 void setStartTag(bool v
) { setBit(StartTag
, v
); }
285 bool discardLF() const { return testBit(DiscardLF
); }
286 void setDiscardLF(bool v
) { setBit(DiscardLF
, v
); }
287 bool allowYield() const { return testBit(AllowYield
); }
288 void setAllowYield(bool v
) { setBit(AllowYield
, v
); }
289 bool loadingExtScript() const { return testBit(LoadingExtScript
); }
290 void setLoadingExtScript(bool v
) { setBit(LoadingExtScript
, v
); }
291 bool forceSynchronous() const { return testBit(ForceSynchronous
); }
292 void setForceSynchronous(bool v
) { setBit(ForceSynchronous
, v
); }
294 bool inAnySpecial() const { return m_bits
& (InScript
| InStyle
| InXmp
| InTextArea
| InTitle
| InIFrame
); }
295 bool hasTagState() const { return m_bits
& TagMask
; }
296 bool hasEntityState() const { return m_bits
& EntityMask
; }
298 bool needsSpecialWriteHandling() const { return m_bits
& (InScript
| InStyle
| InXmp
| InTextArea
| InTitle
| InIFrame
| TagMask
| EntityMask
| InPlainText
| InComment
| InDoctype
| InServer
| InProcessingInstruction
| StartTag
); }
// Bit offset of the entity-state field inside m_bits. It equals the position
// of the lowest bit in EntityMask, which is (1 << 7) - (1 << 4); used by
// entityState() and setEntityState().
301 static const int EntityShift
= 4;
303 TagMask
= (1 << 4) - 1,
304 EntityMask
= (1 << 7) - (1 << 4),
310 InPlainText
= 1 << 12,
311 InProcessingInstruction
= 1 << 13,
313 InTextArea
= 1 << 15,
318 DiscardLF
= 1 << 20, // FIXME: should clarify difference between skip and discard
319 AllowYield
= 1 << 21,
320 LoadingExtScript
= 1 << 22,
321 ForceSynchronous
= 1 << 23,
326 void setBit(StateBits bit
, bool value
)
333 bool testBit(StateBits bit
) const { return m_bits
& bit
; }
340 DoctypeToken m_doctypeToken
;
341 int m_doctypeSearchCount
;
342 int m_doctypeSecondarySearchCount
;
346 // Name of an attribute that we just scanned.
347 AtomicString m_attrName
;
349 // Used to store the code of a scripting sequence
351 // Size of the script sequence stored in @ref #scriptCode
352 int m_scriptCodeSize
;
353 // Maximal size that can be stored in @ref #scriptCode
// checkScriptBuffer() calls enlargeScriptBuffer() once
// m_scriptCodeSize + len reaches this value.
354 int m_scriptCodeCapacity
;
355 // resync point of script code size
356 int m_scriptCodeResync
;
358 // Stores characters if we are scanning for a string like "</script>"
359 UChar searchBuffer
[10];
361 // Counts where we are in the string we are scanning for
363 // the stopper string
364 const char* m_searchStopper
;
365 int m_searchStopperLength
;
367 // if no more data is coming, just parse what we have (including ext scripts that
368 // may be still downloading) and finish
370 // URL to get source code of script from
371 String m_scriptTagSrcAttrValue
;
372 String m_scriptTagCharsetAttrValue
;
373 // the HTML code we will parse after the external script we are waiting for has loaded
374 SegmentedString m_pendingSrc
;
376 // the HTML code we will parse after this particular script has
377 // loaded, but before all pending HTML
378 SegmentedString
* m_currentPrependingSrc
;
380 // true if we are executing a script while parsing a document. This causes the parsing of
381 // the output of the script to be postponed until after the script has finished executing
// NOTE(review): declared as int and returned unchanged by executingScript(),
// so this is presumably a nesting count rather than a plain flag -- confirm.
382 int m_executingScript
;
383 Deque
<CachedResourceHandle
<CachedScript
> > m_pendingScripts
;
384 RefPtr
<HTMLScriptElement
> m_scriptNode
;
386 bool m_requestingScript
;
387 bool m_hasScriptsWaitingForStylesheets
;
389 // if we found one broken comment, there are most likely others as well
390 // store a flag to get rid of the O(n^2) behaviour in such a case.
391 bool m_brokenComments
;
392 // current line number
394 int m_currentScriptTagStartLineNumber
;
395 int m_currentTagStartLineNumber
;
397 double m_tokenizerTimeDelay
;
398 int m_tokenizerChunkSize
;
400 // The timer for continued processing.
401 Timer
<HTMLTokenizer
> m_timer
;
403 // This buffer can hold arbitrarily long user-defined attribute names, such as in EMBED tags.
404 // So any fixed number might be too small, but rather than rewriting all usage of this buffer
405 // we'll just make it large enough to handle all imaginable cases.
407 UChar m_cBuffer
[CBUFLEN
+ 2];
408 unsigned int m_cBufferPos
;
410 SegmentedString m_src
;
412 OwnPtr
<HTMLParser
> m_parser
;
416 OwnPtr
<PreloadScanner
> m_preloadScanner
;
419 void parseHTMLDocumentFragment(const String
&, DocumentFragment
*);
421 UChar
decodeNamedEntity(const char*);
423 } // namespace WebCore
425 #endif // HTMLTokenizer_h