Commit | Line | Data |
---|---|---|
5526e819 | 1 | ///////////////////////////////////////////////////////////////////////////// |
69941f05 | 2 | // Name: htmlpars.h |
5526e819 VS |
3 | // Purpose: wxHtmlParser class (generic parser) |
4 | // Author: Vaclav Slavik | |
69941f05 | 5 | // RCS-ID: $Id$ |
5526e819 | 6 | // Copyright: (c) 1999 Vaclav Slavik |
65571936 | 7 | // Licence: wxWindows licence |
5526e819 VS |
8 | ///////////////////////////////////////////////////////////////////////////// |
9 | ||
69941f05 VS |
10 | #ifndef _WX_HTMLPARS_H_ |
11 | #define _WX_HTMLPARS_H_ | |
5526e819 | 12 | |
5526e819 VS |
13 | #include "wx/defs.h" |
14 | #if wxUSE_HTML | |
15 | ||
69941f05 VS |
16 | #include "wx/html/htmltag.h" |
17 | #include "wx/filesys.h" | |
2826ef0c VS |
18 | #include "wx/hashmap.h" |
19 | #include "wx/hashset.h" | |
20 | #include "wx/vector.h" | |
fc1f2125 | 21 | #include "wx/fontenc.h" |
5526e819 | 22 | |
b5dbe15d VS |
23 | class WXDLLIMPEXP_FWD_BASE wxMBConv; |
24 | class WXDLLIMPEXP_FWD_HTML wxHtmlParser; | |
25 | class WXDLLIMPEXP_FWD_HTML wxHtmlTagHandler; | |
26 | class WXDLLIMPEXP_FWD_HTML wxHtmlEntitiesParser; | |
daa616fc | 27 | |
6c62a62b VS |
28 | class wxHtmlTextPieces; |
29 | class wxHtmlParserState; | |
30 | ||
2826ef0c VS |
31 | WX_DECLARE_HASH_SET_WITH_DECL(wxHtmlTagHandler*, |
32 | wxPointerHash, wxPointerEqual, | |
33 | wxHtmlTagHandlersSet, | |
34 | class WXDLLIMPEXP_HTML); | |
35 | WX_DECLARE_STRING_HASH_MAP_WITH_DECL(wxHtmlTagHandler*, | |
36 | wxHtmlTagHandlersHash, | |
37 | class WXDLLIMPEXP_HTML); | |
38 | ||
6cc4e6b8 VS |
39 | |
40 | enum wxHtmlURLType | |
41 | { | |
42 | wxHTML_URL_PAGE, | |
43 | wxHTML_URL_IMAGE, | |
44 | wxHTML_URL_OTHER | |
45 | }; | |
46 | ||
daa616fc | 47 | // This class handles generic parsing of HTML document : it scans |
6a17b868 | 48 | // the document and divides it into blocks of tags (where one block |
daa616fc VS |
49 | // consists of starting and ending tag and of text between these |
50 | // 2 tags). | |
6acba9a7 | 51 | class WXDLLIMPEXP_HTML wxHtmlParser : public wxObject |
5526e819 VS |
52 | { |
53 | DECLARE_ABSTRACT_CLASS(wxHtmlParser) | |
54 | ||
1309ba6c | 55 | public: |
daa616fc | 56 | wxHtmlParser(); |
1309ba6c VS |
57 | virtual ~wxHtmlParser(); |
58 | ||
59 | // Sets the class which will be used for opening files | |
60 | void SetFS(wxFileSystem *fs) { m_FS = fs; } | |
61 | ||
62 | wxFileSystem* GetFS() const { return m_FS; } | |
63 | ||
6cc4e6b8 | 64 | // Opens file if the parser is allowed to open given URL (may be forbidden |
6953da00 | 65 | // for security reasons) |
6cc4e6b8 | 66 | virtual wxFSFile *OpenURL(wxHtmlURLType type, const wxString& url) const; |
04db5c3f | 67 | |
1309ba6c VS |
68 | // You can simply call this method when you need parsed output. |
69 | // This method does these things: | |
70 | // 1. call InitParser(source); | |
71 | // 2. call DoParsing(); | |
6a17b868 | 72 | // 3. call GetProduct(); (its return value is then returned) |
1309ba6c VS |
73 | // 4. call DoneParser(); |
74 | wxObject* Parse(const wxString& source); | |
75 | ||
76 | // Sets the source. This must be called before running Parse() method. | |
77 | virtual void InitParser(const wxString& source); | |
78 | // This must be called after Parse(). | |
79 | virtual void DoneParser(); | |
6953da00 | 80 | |
2b5f62a0 | 81 | // May be called during parsing to immediately return from Parse(). |
6953da00 | 82 | virtual void StopParsing() { m_stopParsing = true; } |
1309ba6c VS |
83 | |
84 | // Parses the m_Source from begin_pos to end_pos-1. | |
85 | // (in noparams version it parses whole m_Source) | |
b1a3a964 VS |
86 | void DoParsing(const wxString::const_iterator& begin_pos, |
87 | const wxString::const_iterator& end_pos); | |
6c62a62b VS |
88 | void DoParsing(); |
89 | ||
90 | // Returns pointer to the tag at parser's current position | |
91 | wxHtmlTag *GetCurrentTag() const { return m_CurTag; } | |
1309ba6c VS |
92 | |
93 | // Returns product of parsing | |
94 | // Returned value is result of parsing of the part. The type of this result | |
95 | // depends on internal representation in derived parser | |
96 | // (see wxHtmlWinParser for details). | |
97 | virtual wxObject* GetProduct() = 0; | |
98 | ||
99 | // adds handler to the list & hash table of handlers. | |
100 | virtual void AddTagHandler(wxHtmlTagHandler *handler); | |
101 | ||
6953da00 | 102 | // Forces the handler to handle additional tags (not returned by GetSupportedTags). |
1309ba6c VS |
103 | // The handler should already be in use by this parser. |
104 | // Example: you want to parse following pseudo-html structure: | |
105 | // <myitems> | |
106 | // <it name="one" value="1"> | |
107 | // <it name="two" value="2"> | |
108 | // </myitems> | |
109 | // <it> This last it has different meaning, we don't want it to be parsed by myitems handler! | |
6a17b868 | 110 | // handler can handle only 'myitems' (e.g. its GetSupportedTags returns "MYITEMS") |
1309ba6c VS |
111 | // you can call PushTagHandler(handler, "IT") when you find <myitems> |
112 | // and call PopTagHandler() when you find </myitems> | |
fbfb8bcc | 113 | void PushTagHandler(wxHtmlTagHandler *handler, const wxString& tags); |
1309ba6c VS |
114 | |
115 | // Restores state before last call to PushTagHandler | |
116 | void PopTagHandler(); | |
117 | ||
b1a3a964 | 118 | const wxString* GetSource() {return m_Source;} |
1309ba6c | 119 | void SetSource(const wxString& src); |
6953da00 | 120 | |
6a17b868 | 121 | // Sets HTML source and remembers current parser's state so that it can |
6953da00 | 122 | // later be restored. This is useful for on-line modifications of |
6c62a62b VS |
123 | // HTML source (for example, <pre> handler replaces spaces with &nbsp; |
124 | // and newlines with <br>) | |
125 | virtual void SetSourceAndSaveState(const wxString& src); | |
6953da00 | 126 | // Restores parser's state from stack or returns false if the stack is |
6c62a62b VS |
127 | // empty |
128 | virtual bool RestoreState(); | |
6953da00 | 129 | |
e7feeafa VS |
130 | // Returns HTML source inside the element (i.e. between the starting |
131 | // and ending tag) | |
132 | wxString GetInnerSource(const wxHtmlTag& tag); | |
133 | ||
2b5f62a0 VZ |
134 | // Parses HTML string 'markup' and extracts charset info from <meta> tag |
135 | // if present. Returns empty string if the tag is missing. | |
136 | // For wxHTML's internal use. | |
137 | static wxString ExtractCharsetInformation(const wxString& markup); | |
6953da00 | 138 | |
caea1cb7 VS |
139 | // Returns entity parser object, used to substitute HTML &entities; |
140 | wxHtmlEntitiesParser *GetEntitiesParser() const { return m_entitiesParser; } | |
1309ba6c | 141 | |
4609ee2e VZ |
142 | // Returns true if the tag starting at the given position is a comment tag |
143 | // | |
144 | // p should point to '<' character and is modified to point to the closing | |
145 | // '>' of the end comment tag if this is indeed a comment | |
146 | static bool | |
147 | SkipCommentTag(wxString::const_iterator& p, wxString::const_iterator end); | |
148 | ||
1309ba6c | 149 | protected: |
6c62a62b VS |
150 | // DOM structure |
151 | void CreateDOMTree(); | |
152 | void DestroyDOMTree(); | |
153 | void CreateDOMSubTree(wxHtmlTag *cur, | |
b1a3a964 VS |
154 | const wxString::const_iterator& begin_pos, |
155 | const wxString::const_iterator& end_pos, | |
6c62a62b VS |
156 | wxHtmlTagsCache *cache); |
157 | ||
1309ba6c VS |
158 | // Adds text to the output. |
159 | // This is called from Parse() and must be overridden in derived classes. | |
5bce3e6f VS |
160 | // txt is not guaranteed to be only one word. It is largest continuous part |
161 | // of text (= not broken by tags) | |
162 | virtual void AddText(const wxString& txt) = 0; | |
1309ba6c VS |
163 | |
164 | // Adds tag and processes it. Parse() may be (and usually is) called from this method. | |
165 | // This is called from Parse() and may be overridden. | |
166 | // Default behavior is that it looks for proper handler in m_Handlers. The tag is | |
167 | // ignored if no handler is found. | |
168 | // Derived class is *responsible* for filling in m_Handlers table. | |
169 | virtual void AddTag(const wxHtmlTag& tag); | |
170 | ||
171 | protected: | |
6c62a62b VS |
172 | // DOM tree: |
173 | wxHtmlTag *m_CurTag; | |
174 | wxHtmlTag *m_Tags; | |
175 | wxHtmlTextPieces *m_TextPieces; | |
176 | size_t m_CurTextPiece; | |
1309ba6c | 177 | |
b1a3a964 | 178 | const wxString *m_Source; |
6953da00 | 179 | |
6c62a62b | 180 | wxHtmlParserState *m_SavedStates; |
6953da00 | 181 | |
1309ba6c VS |
182 | // handlers that handle particular tags. The table is accessed by |
183 | // key = tag's name. | |
184 | // This attribute MUST be filled by derived class otherwise it would | |
185 | // be empty and no tags would be recognized | |
186 | // (see wxHtmlWinParser for details about filling it) | |
187 | // m_HandlersHash is for random access based on knowledge of tag name (BR, P, etc.) | |
188 | // it may (and often does) contain more references to one object | |
189 | // m_HandlersSet is set of all handlers and it is guaranteed to contain | |
190 | // only one reference to each handler instance. | |
2826ef0c VS |
191 | wxHtmlTagHandlersSet m_HandlersSet; |
192 | wxHtmlTagHandlersHash m_HandlersHash; | |
1309ba6c | 193 | |
22f3361e VZ |
194 | DECLARE_NO_COPY_CLASS(wxHtmlParser) |
195 | ||
1309ba6c VS |
196 | // class for opening files (file system) |
197 | wxFileSystem *m_FS; | |
198 | // handlers stack used by PushTagHandler and PopTagHandler | |
2826ef0c | 199 | wxVector<wxHtmlTagHandlersHash*> m_HandlersStack; |
6953da00 | 200 | |
daa616fc VS |
201 | // entity parser |
202 | wxHtmlEntitiesParser *m_entitiesParser; | |
6953da00 | 203 | |
2b5f62a0 VZ |
204 | // flag indicating that the parser should stop |
205 | bool m_stopParsing; | |
5526e819 VS |
206 | }; |
207 | ||
208 | ||
209 | ||
daa616fc VS |
210 | // This class (and derived classes) cooperates with wxHtmlParser. |
211 | // Each recognized tag is passed to handler which is capable | |
212 | // of handling it. Each tag is handled in 3 steps: | |
213 | // 1. Handler modifies state of parser | |
6a17b868 | 214 | // (using its public methods) |
daa616fc VS |
215 | // 2. Parser parses source between starting and ending tag |
216 | // 3. Handler restores original state of the parser | |
6acba9a7 | 217 | class WXDLLIMPEXP_HTML wxHtmlTagHandler : public wxObject |
5526e819 VS |
218 | { |
219 | DECLARE_ABSTRACT_CLASS(wxHtmlTagHandler) | |
220 | ||
1309ba6c VS |
221 | public: |
222 | wxHtmlTagHandler() : wxObject () { m_Parser = NULL; } | |
223 | ||
224 | // Sets the parser. | |
225 | // NOTE : each _instance_ of handler is guaranteed to be called | |
226 | // only by one parser. This means you don't have to care about | |
227 | // reentrancy. | |
6953da00 | 228 | virtual void SetParser(wxHtmlParser *parser) |
1309ba6c VS |
229 | { m_Parser = parser; } |
230 | ||
231 | // Returns list of supported tags. The list is in uppercase and | |
232 | // tags are delimited by ','. | |
233 | // Example : "I,B,FONT,P" | |
234 | // is capable of handling italic, bold, font and paragraph tags | |
235 | virtual wxString GetSupportedTags() = 0; | |
236 | ||
237 | // This is the core handling method. It does all the Steps 1-3. | |
238 | // To process step 2, you can call ParseInner() | |
6953da00 WS |
239 | // returned value : true if it called ParseInner(), |
240 | // false otherwise | |
1309ba6c VS |
241 | virtual bool HandleTag(const wxHtmlTag& tag) = 0; |
242 | ||
243 | protected: | |
244 | // parses input between beginning and ending tag. | |
245 | // m_Parser must be set. | |
6953da00 | 246 | void ParseInner(const wxHtmlTag& tag) |
b1a3a964 | 247 | { m_Parser->DoParsing(tag.GetBeginIter(), tag.GetEndIter1()); } |
1309ba6c | 248 | |
e7feeafa VS |
249 | // Parses given source as if it was tag's inner code (see |
250 | // wxHtmlParser::GetInnerSource). Unlike ParseInner(), this method lets | |
251 | // you specify the source code to parse. This is useful when you need to | |
252 | // modify the inner text before parsing. | |
253 | void ParseInnerSource(const wxString& source); | |
254 | ||
1309ba6c | 255 | wxHtmlParser *m_Parser; |
22f3361e VZ |
256 | |
257 | DECLARE_NO_COPY_CLASS(wxHtmlTagHandler) | |
5526e819 VS |
258 | }; |
259 | ||
260 | ||
daa616fc VS |
261 | // This class is used to parse HTML entities in strings. It can handle |
262 | // both named entities and &#xxxx entries where xxxx is Unicode code. | |
6acba9a7 | 263 | class WXDLLIMPEXP_HTML wxHtmlEntitiesParser : public wxObject |
daa616fc VS |
264 | { |
265 | DECLARE_DYNAMIC_CLASS(wxHtmlEntitiesParser) | |
266 | ||
267 | public: | |
268 | wxHtmlEntitiesParser(); | |
269 | virtual ~wxHtmlEntitiesParser(); | |
6953da00 | 270 | |
daa616fc VS |
271 | // Sets encoding of output string. |
272 | // Has no effect if wxUSE_WCHAR_T==0 or wxUSE_UNICODE==1 | |
273 | void SetEncoding(wxFontEncoding encoding); | |
6953da00 | 274 | |
daa616fc VS |
275 | // Parses entities in input and replaces them with respective characters |
276 | // (with respect to output encoding) | |
96d665d2 | 277 | wxString Parse(const wxString& input) const; |
61b50a43 | 278 | |
f5e6ed7c | 279 | // Returns character for given entity or 0 if the entity is unknown |
96d665d2 | 280 | wxChar GetEntityChar(const wxString& entity) const; |
61b50a43 VS |
281 | |
282 | // Returns character that represents given Unicode code | |
2b5f62a0 | 283 | #if wxUSE_UNICODE |
96d665d2 | 284 | wxChar GetCharForCode(unsigned code) const { return (wxChar)code; } |
2b5f62a0 | 285 | #else |
96d665d2 | 286 | wxChar GetCharForCode(unsigned code) const; |
2b5f62a0 | 287 | #endif |
daa616fc | 288 | |
61b50a43 | 289 | protected: |
daa616fc VS |
290 | #if wxUSE_WCHAR_T && !wxUSE_UNICODE |
291 | wxMBConv *m_conv; | |
292 | wxFontEncoding m_encoding; | |
293 | #endif | |
22f3361e VZ |
294 | |
295 | DECLARE_NO_COPY_CLASS(wxHtmlEntitiesParser) | |
daa616fc | 296 | }; |
5526e819 VS |
297 | |
298 | ||
5526e819 | 299 | #endif |
69941f05 VS |
300 | |
301 | #endif // _WX_HTMLPARS_H_ |