git.saurik.com Git - iphone-api.git/blob - WebCore/HTMLTokenizer.h
Adding the WebCore headers (for Cydget).
[iphone-api.git] / WebCore / HTMLTokenizer.h
1 /*
2 Copyright (C) 1997 Martin Jones (mjones@kde.org)
3 (C) 1997 Torben Weis (weis@kde.org)
4 (C) 1998 Waldo Bastian (bastian@kde.org)
5 (C) 2001 Dirk Mueller (mueller@kde.org)
6 Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008, 2009 Apple Inc. All rights reserved.
7
8 This library is free software; you can redistribute it and/or
9 modify it under the terms of the GNU Library General Public
10 License as published by the Free Software Foundation; either
11 version 2 of the License, or (at your option) any later version.
12
13 This library is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Library General Public License for more details.
17
18 You should have received a copy of the GNU Library General Public License
19 along with this library; see the file COPYING.LIB. If not, write to
20 the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
21 Boston, MA 02110-1301, USA.
22 */
23
24 #ifndef HTMLTokenizer_h
25 #define HTMLTokenizer_h
26
27 #include "CachedResourceClient.h"
28 #include "CachedResourceHandle.h"
29 #include "NamedMappedAttrMap.h"
30 #include "SegmentedString.h"
31 #include "Timer.h"
32 #include "Tokenizer.h"
33 #include <wtf/Deque.h>
34 #include <wtf/OwnPtr.h>
35 #include <wtf/Vector.h>
36
37 namespace WebCore {
38
39 class CachedScript;
40 class DocumentFragment;
41 class Document;
42 class HTMLDocument;
43 class HTMLScriptElement;
44 class HTMLViewSourceDocument;
45 class FrameView;
46 class HTMLParser;
47 class Node;
48 class PreloadScanner;
49 class ScriptSourceCode;
50
51 extern const double defaultTokenizerTimeDelay;
52
53 /**
54 * @internal
55 * represents one HTML tag. Consists of a numerical id, and the list
56 * of attributes. Can also represent text. In this case the id = 0 and
57 * text contains the text.
58 */
59 struct Token {
60 Token()
61 : beginTag(true)
62 , selfClosingTag(false)
63 , brokenXMLStyle(false)
64 , m_sourceInfo(0)
65 { }
66 ~Token() { }
67
68 void addAttribute(AtomicString& attrName, const AtomicString& v, bool viewSourceMode);
69
70 bool isOpenTag(const QualifiedName& fullName) const { return beginTag && fullName.localName() == tagName; }
71 bool isCloseTag(const QualifiedName& fullName) const { return !beginTag && fullName.localName() == tagName; }
72
73 void reset()
74 {
75 attrs = 0;
76 text = 0;
77 tagName = nullAtom;
78 beginTag = true;
79 selfClosingTag = false;
80 brokenXMLStyle = false;
81 if (m_sourceInfo)
82 m_sourceInfo->clear();
83 }
84
85 void addViewSourceChar(UChar c) { if (!m_sourceInfo.get()) m_sourceInfo.set(new Vector<UChar>); m_sourceInfo->append(c); }
86
87 RefPtr<NamedMappedAttrMap> attrs;
88 RefPtr<StringImpl> text;
89 AtomicString tagName;
90 bool beginTag;
91 bool selfClosingTag;
92 bool brokenXMLStyle;
93 OwnPtr<Vector<UChar> > m_sourceInfo;
94 };
95
// States stored in DoctypeToken::m_state. DoctypeToken::reset() returns to
// DoctypeBegin. Values are spelled out explicitly, matching the EntityState
// and TagState enums in this file; they equal the implicit numbering.
enum DoctypeState {
    DoctypeBegin = 0,
    DoctypeBeforeName = 1,
    DoctypeName = 2,
    DoctypeAfterName = 3,
    DoctypeBeforePublicID = 4,
    DoctypePublicID = 5,
    DoctypeAfterPublicID = 6,
    DoctypeBeforeSystemID = 7,
    DoctypeSystemID = 8,
    DoctypeAfterSystemID = 9,
    DoctypeBogus = 10
};
109
110 class DoctypeToken {
111 public:
112 DoctypeToken() {}
113
114 void reset()
115 {
116 m_name.clear();
117 m_publicID.clear();
118 m_systemID.clear();
119 m_state = DoctypeBegin;
120 m_source.clear();
121 }
122
123 DoctypeState state() { return m_state; }
124 void setState(DoctypeState s) { m_state = s; }
125
126 Vector<UChar> m_name;
127 Vector<UChar> m_publicID;
128 Vector<UChar> m_systemID;
129 DoctypeState m_state;
130
131 Vector<UChar> m_source;
132 };
133
134 //-----------------------------------------------------------------------------
135
136 class HTMLTokenizer : public Tokenizer, public CachedResourceClient {
137 public:
138 HTMLTokenizer(HTMLDocument*, bool reportErrors);
139 HTMLTokenizer(HTMLViewSourceDocument*);
140 HTMLTokenizer(DocumentFragment*);
141 virtual ~HTMLTokenizer();
142
143 virtual bool write(const SegmentedString&, bool appendData);
144 virtual void finish();
145 virtual void setForceSynchronous(bool force);
146 virtual bool isWaitingForScripts() const;
147 virtual void stopParsing();
148 virtual bool processingData() const;
149 virtual int executingScript() const { return m_executingScript; }
150 virtual void parsePending();
151
152 virtual int lineNumber() const { return m_lineNumber; }
153 virtual int columnNumber() const { return 1; }
154
155 bool processingContentWrittenByScript() const { return m_src.excludeLineNumbers(); }
156
157 virtual void executeScriptsWaitingForStylesheets();
158
159 virtual bool isHTMLTokenizer() const { return true; }
160 HTMLParser* htmlParser() const { return m_parser.get(); }
161
162 private:
163 class State;
164
165 // Where we are in parsing a tag
166 void begin();
167 void end();
168
169 void reset();
170
171 PassRefPtr<Node> processToken();
172 void processDoctypeToken();
173
174 State processListing(SegmentedString, State);
175 State parseComment(SegmentedString&, State);
176 State parseDoctype(SegmentedString&, State);
177 State parseServer(SegmentedString&, State);
178 State parseText(SegmentedString&, State);
179 State parseSpecial(SegmentedString&, State);
180 State parseTag(SegmentedString&, State);
181 State parseEntity(SegmentedString&, UChar*& dest, State, unsigned& cBufferPos, bool start, bool parsingTag);
182 State parseProcessingInstruction(SegmentedString&, State);
183 State scriptHandler(State);
184 State scriptExecution(const ScriptSourceCode&, State);
185 void setSrc(const SegmentedString&);
186
187 // check if we have enough space in the buffer.
188 // if not enlarge it
189 inline void checkBuffer(int len = 10)
190 {
191 if ((m_dest - m_buffer) > m_bufferSize - len)
192 enlargeBuffer(len);
193 }
194
195 inline void checkScriptBuffer(int len = 10)
196 {
197 if (m_scriptCodeSize + len >= m_scriptCodeCapacity)
198 enlargeScriptBuffer(len);
199 }
200
201 void enlargeBuffer(int len);
202 void enlargeScriptBuffer(int len);
203
204 bool continueProcessing(int& processedCount, double startTime, State&);
205 void timerFired(Timer<HTMLTokenizer>*);
206 void allDataProcessed();
207
208 // from CachedResourceClient
209 void notifyFinished(CachedResource*);
210
211 // Internal buffers
212 ///////////////////
213 UChar* m_buffer;
214 int m_bufferSize;
215 UChar* m_dest;
216
217 Token m_currentToken;
218
219 // Tokenizer flags
220 //////////////////
221 // are we in quotes within a html tag
222 enum { NoQuote, SingleQuote, DoubleQuote } tquote;
223
224 // Are we in a &... character entity description?
225 enum EntityState {
226 NoEntity = 0,
227 SearchEntity = 1,
228 NumericSearch = 2,
229 Hexadecimal = 3,
230 Decimal = 4,
231 EntityName = 5,
232 SearchSemicolon = 6
233 };
234 unsigned EntityUnicodeValue;
235
236 enum TagState {
237 NoTag = 0,
238 TagName = 1,
239 SearchAttribute = 2,
240 AttributeName = 3,
241 SearchEqual = 4,
242 SearchValue = 5,
243 QuotedValue = 6,
244 Value = 7,
245 SearchEnd = 8
246 };
247
248 class State {
249 public:
250 State() : m_bits(0) { }
251
252 TagState tagState() const { return static_cast<TagState>(m_bits & TagMask); }
253 void setTagState(TagState t) { m_bits = (m_bits & ~TagMask) | t; }
254 EntityState entityState() const { return static_cast<EntityState>((m_bits & EntityMask) >> EntityShift); }
255 void setEntityState(EntityState e) { m_bits = (m_bits & ~EntityMask) | (e << EntityShift); }
256
257 bool inScript() const { return testBit(InScript); }
258 void setInScript(bool v) { setBit(InScript, v); }
259 bool inStyle() const { return testBit(InStyle); }
260 void setInStyle(bool v) { setBit(InStyle, v); }
261 bool inXmp() const { return testBit(InXmp); }
262 void setInXmp(bool v) { setBit(InXmp, v); }
263 bool inTitle() const { return testBit(InTitle); }
264 void setInTitle(bool v) { setBit(InTitle, v); }
265 bool inIFrame() const { return testBit(InIFrame); }
266 void setInIFrame(bool v) { setBit(InIFrame, v); }
267 bool inPlainText() const { return testBit(InPlainText); }
268 void setInPlainText(bool v) { setBit(InPlainText, v); }
269 bool inProcessingInstruction() const { return testBit(InProcessingInstruction); }
270 void setInProcessingInstruction(bool v) { return setBit(InProcessingInstruction, v); }
271 bool inComment() const { return testBit(InComment); }
272 void setInComment(bool v) { setBit(InComment, v); }
273 bool inDoctype() const { return testBit(InDoctype); }
274 void setInDoctype(bool v) { setBit(InDoctype, v); }
275 bool inTextArea() const { return testBit(InTextArea); }
276 void setInTextArea(bool v) { setBit(InTextArea, v); }
277 bool escaped() const { return testBit(Escaped); }
278 void setEscaped(bool v) { setBit(Escaped, v); }
279 bool inServer() const { return testBit(InServer); }
280 void setInServer(bool v) { setBit(InServer, v); }
281 bool skipLF() const { return testBit(SkipLF); }
282 void setSkipLF(bool v) { setBit(SkipLF, v); }
283 bool startTag() const { return testBit(StartTag); }
284 void setStartTag(bool v) { setBit(StartTag, v); }
285 bool discardLF() const { return testBit(DiscardLF); }
286 void setDiscardLF(bool v) { setBit(DiscardLF, v); }
287 bool allowYield() const { return testBit(AllowYield); }
288 void setAllowYield(bool v) { setBit(AllowYield, v); }
289 bool loadingExtScript() const { return testBit(LoadingExtScript); }
290 void setLoadingExtScript(bool v) { setBit(LoadingExtScript, v); }
291 bool forceSynchronous() const { return testBit(ForceSynchronous); }
292 void setForceSynchronous(bool v) { setBit(ForceSynchronous, v); }
293
294 bool inAnySpecial() const { return m_bits & (InScript | InStyle | InXmp | InTextArea | InTitle | InIFrame); }
295 bool hasTagState() const { return m_bits & TagMask; }
296 bool hasEntityState() const { return m_bits & EntityMask; }
297
298 bool needsSpecialWriteHandling() const { return m_bits & (InScript | InStyle | InXmp | InTextArea | InTitle | InIFrame | TagMask | EntityMask | InPlainText | InComment | InDoctype | InServer | InProcessingInstruction | StartTag); }
299
300 private:
301 static const int EntityShift = 4;
302 enum StateBits {
303 TagMask = (1 << 4) - 1,
304 EntityMask = (1 << 7) - (1 << 4),
305 InScript = 1 << 7,
306 InStyle = 1 << 8,
307 // Bit 9 unused
308 InXmp = 1 << 10,
309 InTitle = 1 << 11,
310 InPlainText = 1 << 12,
311 InProcessingInstruction = 1 << 13,
312 InComment = 1 << 14,
313 InTextArea = 1 << 15,
314 Escaped = 1 << 16,
315 InServer = 1 << 17,
316 SkipLF = 1 << 18,
317 StartTag = 1 << 19,
318 DiscardLF = 1 << 20, // FIXME: should clarify difference between skip and discard
319 AllowYield = 1 << 21,
320 LoadingExtScript = 1 << 22,
321 ForceSynchronous = 1 << 23,
322 InIFrame = 1 << 24,
323 InDoctype = 1 << 25
324 };
325
326 void setBit(StateBits bit, bool value)
327 {
328 if (value)
329 m_bits |= bit;
330 else
331 m_bits &= ~bit;
332 }
333 bool testBit(StateBits bit) const { return m_bits & bit; }
334
335 unsigned m_bits;
336 };
337
338 State m_state;
339
340 DoctypeToken m_doctypeToken;
341 int m_doctypeSearchCount;
342 int m_doctypeSecondarySearchCount;
343
344 bool m_brokenServer;
345
346 // Name of an attribute that we just scanned.
347 AtomicString m_attrName;
348
349 // Used to store the code of a scripting sequence
350 UChar* m_scriptCode;
351 // Size of the script sequenze stored in @ref #scriptCode
352 int m_scriptCodeSize;
353 // Maximal size that can be stored in @ref #scriptCode
354 int m_scriptCodeCapacity;
355 // resync point of script code size
356 int m_scriptCodeResync;
357
358 // Stores characters if we are scanning for a string like "</script>"
359 UChar searchBuffer[10];
360
361 // Counts where we are in the string we are scanning for
362 int searchCount;
363 // the stopper string
364 const char* m_searchStopper;
365 int m_searchStopperLength;
366
367 // if no more data is coming, just parse what we have (including ext scripts that
368 // may be still downloading) and finish
369 bool m_noMoreData;
370 // URL to get source code of script from
371 String m_scriptTagSrcAttrValue;
372 String m_scriptTagCharsetAttrValue;
373 // the HTML code we will parse after the external script we are waiting for has loaded
374 SegmentedString m_pendingSrc;
375
376 // the HTML code we will parse after this particular script has
377 // loaded, but before all pending HTML
378 SegmentedString* m_currentPrependingSrc;
379
380 // true if we are executing a script while parsing a document. This causes the parsing of
381 // the output of the script to be postponed until after the script has finished executing
382 int m_executingScript;
383 Deque<CachedResourceHandle<CachedScript> > m_pendingScripts;
384 RefPtr<HTMLScriptElement> m_scriptNode;
385
386 bool m_requestingScript;
387 bool m_hasScriptsWaitingForStylesheets;
388
389 // if we found one broken comment, there are most likely others as well
390 // store a flag to get rid of the O(n^2) behaviour in such a case.
391 bool m_brokenComments;
392 // current line number
393 int m_lineNumber;
394 int m_currentScriptTagStartLineNumber;
395 int m_currentTagStartLineNumber;
396
397 double m_tokenizerTimeDelay;
398 int m_tokenizerChunkSize;
399
400 // The timer for continued processing.
401 Timer<HTMLTokenizer> m_timer;
402
403 // This buffer can hold arbitrarily long user-defined attribute names, such as in EMBED tags.
404 // So any fixed number might be too small, but rather than rewriting all usage of this buffer
405 // we'll just make it large enough to handle all imaginable cases.
406 #define CBUFLEN 1024
407 UChar m_cBuffer[CBUFLEN + 2];
408 unsigned int m_cBufferPos;
409
410 SegmentedString m_src;
411 Document* m_doc;
412 OwnPtr<HTMLParser> m_parser;
413 bool m_inWrite;
414 bool m_fragment;
415
416 OwnPtr<PreloadScanner> m_preloadScanner;
417 };
418
419 void parseHTMLDocumentFragment(const String&, DocumentFragment*);
420
421 UChar decodeNamedEntity(const char*);
422
423 } // namespace WebCore
424
425 #endif // HTMLTokenizer_h