]>
Commit | Line | Data |
---|---|---|
c21faa0a VS |
1 | ///////////////////////////////////////////////////////////////////////////// |
2 | // Name: htmlpars.h | |
3 | // Purpose: wx28HtmlParser class (generic parser) | |
4 | // Author: Vaclav Slavik | |
c21faa0a VS |
5 | // Copyright: (c) 1999 Vaclav Slavik |
6 | // Licence: wxWindows licence | |
7 | ///////////////////////////////////////////////////////////////////////////// | |
8 | ||
9 | #ifndef _WX_HTMLPARS_H_ | |
10 | #define _WX_HTMLPARS_H_ | |
11 | ||
12 | #include "wx/defs.h" | |
13 | #include "wx/filesys.h" | |
14 | #include "wx/hash.h" | |
15 | #include "wx/fontenc.h" | |
16 | ||
17 | #include "htmltag.h" | |
18 | ||
19 | class wxMBConv; | |
20 | class wx28HtmlParser; | |
21 | class wx28HtmlTagHandler; | |
22 | class wx28HtmlEntitiesParser; | |
23 | ||
24 | class wx28HtmlTextPieces; | |
25 | class wx28HtmlParserState; | |
26 | ||
27 | ||
// Classifies the URL handed to wx28HtmlParser::OpenURL() (its "type" argument).
28 | enum wx28HtmlURLType | |
29 | { | |
30 | wxHTML_URL_PAGE, | |
31 | wxHTML_URL_IMAGE, | |
32 | wxHTML_URL_OTHER | |
33 | }; | |
34 | ||
35 | // This class handles generic parsing of HTML document : it scans | |
36 | // the document and divides it into blocks of tags (where one block | |
37 | // consists of starting and ending tag and of text between these | |
38 | // 2 tags). | |
39 | class wx28HtmlParser : public wxObject | |
40 | { | |
41 | DECLARE_ABSTRACT_CLASS(wx28HtmlParser) | |
42 | ||
43 | public: | |
44 | wx28HtmlParser(); | |
45 | virtual ~wx28HtmlParser(); | |
46 | ||
47 | // Sets the class which will be used for opening files | |
48 | void SetFS(wxFileSystem *fs) { m_FS = fs; } | |
49 | ||
50 | wxFileSystem* GetFS() const { return m_FS; } | |
51 | ||
52 | // Opens file if the parser is allowed to open given URL (may be forbidden | |
53 | // for security reasons) | |
54 | virtual wxFSFile *OpenURL(wx28HtmlURLType type, const wxString& url) const; | |
55 | ||
56 | // You can simply call this method when you need parsed output. | |
57 | // This method does these things: | |
58 | // 1. call InitParser(source); | |
59 | // 2. call DoParsing(); | |
60 | // 3. call GetProduct(); (its return value is then returned) | |
61 | // 4. call DoneParser(); | |
62 | wxObject* Parse(const wxString& source); | |
63 | ||
64 | // Sets the source. This must be called before running Parse() method. | |
65 | virtual void InitParser(const wxString& source); | |
66 | // This must be called after Parse(). | |
67 | virtual void DoneParser(); | |
68 | ||
69 | // May be called during parsing to immediately return from Parse(). | |
70 | virtual void StopParsing() { m_stopParsing = true; } | |
71 | ||
72 | // Parses the m_Source from begin_pos to end_pos-1. | |
73 | // (in noparams version it parses whole m_Source) | |
74 | void DoParsing(int begin_pos, int end_pos); | |
75 | void DoParsing(); | |
76 | ||
77 | // Returns pointer to the tag at parser's current position | |
78 | wx28HtmlTag *GetCurrentTag() const { return m_CurTag; } | |
79 | ||
80 | // Returns product of parsing | |
81 | // Returned value is result of parsing of the part. The type of this result | |
82 | // depends on internal representation in derived parser | |
83 | // (see wx28HtmlWinParser for details). | |
84 | virtual wxObject* GetProduct() = 0; | |
85 | ||
86 | // adds handler to the list & hash table of handlers. | |
87 | virtual void AddTagHandler(wx28HtmlTagHandler *handler); | |
88 | ||
89 | // Forces the handler to handle additional tags (not returned by GetSupportedTags). | |
90 | // The handler should already be in use by this parser. | |
91 | // Example: you want to parse following pseudo-html structure: | |
92 | // <myitems> | |
93 | // <it name="one" value="1"> | |
94 | // <it name="two" value="2"> | |
95 | // </myitems> | |
96 | // <it> This last it has different meaning, we don't want it to be parsed by myitems handler! | |
97 | // handler can handle only 'myitems' (e.g. its GetSupportedTags returns "MYITEMS") | |
98 | // you can call PushTagHandler(handler, "IT") when you find <myitems> | |
99 | // and call PopTagHandler() when you find </myitems> | |
100 | void PushTagHandler(wx28HtmlTagHandler *handler, const wxString& tags); | |
101 | ||
102 | // Restores state before last call to PushTagHandler | |
103 | void PopTagHandler(); | |
104 | ||
105 | wxString* GetSource() {return &m_Source;} | |
106 | void SetSource(const wxString& src); | |
107 | ||
108 | // Sets HTML source and remembers current parser's state so that it can | |
109 | // later be restored. This is useful for on-line modifications of | |
110 | // HTML source (for example, <pre> handler replaces spaces with | |
111 | // &nbsp; and newlines with <br>) | |
112 | virtual void SetSourceAndSaveState(const wxString& src); | |
113 | // Restores parser's state from stack or returns false if the stack is | |
114 | // empty | |
115 | virtual bool RestoreState(); | |
116 | ||
117 | // Returns HTML source inside the element (i.e. between the starting | |
118 | // and ending tag) | |
119 | wxString GetInnerSource(const wx28HtmlTag& tag); | |
120 | ||
121 | // Parses HTML string 'markup' and extracts charset info from <meta> tag | |
122 | // if present. Returns empty string if the tag is missing. | |
123 | // For wxHTML's internal use. | |
124 | static wxString ExtractCharsetInformation(const wxString& markup); | |
125 | ||
126 | // Returns entity parser object, used to substitute HTML &entities; | |
127 | wx28HtmlEntitiesParser *GetEntitiesParser() const { return m_entitiesParser; } | |
128 | ||
129 | protected: | |
130 | // DOM structure | |
131 | void CreateDOMTree(); | |
132 | void DestroyDOMTree(); | |
133 | void CreateDOMSubTree(wx28HtmlTag *cur, | |
134 | int begin_pos, int end_pos, | |
135 | wx28HtmlTagsCache *cache); | |
136 | ||
137 | // Adds text to the output. | |
4c51a665 | 138 | // This is called from Parse() and must be overridden in derived classes. |
c21faa0a VS |
139 | // txt is not guaranteed to be only one word. It is the largest continuous part of text |
140 | // (= not broken by tags) | |
141 | // NOTE : using char* because of speed improvements | |
142 | virtual void AddText(const wxChar* txt) = 0; | |
143 | ||
144 | // Adds tag and processes it. Parse() may be (and usually is) called from this method. |
4c51a665 DS |
145 | // This is called from Parse() and may be overridden. |
146 | // Default behaviour is that it looks for proper handler in m_Handlers. The tag is | |
c21faa0a VS |
147 | // ignored if no handler is found. |
148 | // Derived class is *responsible* for filling in m_Handlers table. | |
149 | virtual void AddTag(const wx28HtmlTag& tag); | |
150 | ||
151 | protected: | |
152 | // DOM tree: | |
153 | wx28HtmlTag *m_CurTag; | |
154 | wx28HtmlTag *m_Tags; | |
155 | wx28HtmlTextPieces *m_TextPieces; | |
156 | size_t m_CurTextPiece; | |
157 | ||
158 | wxString m_Source; | |
159 | ||
160 | wx28HtmlParserState *m_SavedStates; | |
161 | ||
162 | // handlers that handle particular tags. The table is accessed by | |
163 | // key = tag's name. | |
164 | // This attribute MUST be filled by derived class otherwise it would | |
165 | // be empty and no tags would be recognized | |
166 | // (see wx28HtmlWinParser for details about filling it) | |
167 | // m_HandlersHash is for random access based on knowledge of tag name (BR, P, etc.) | |
168 | // it may (and often does) contain more references to one object | |
169 | // m_HandlersList is list of all handlers and it is guaranteed to contain | |
170 | // only one reference to each handler instance. | |
171 | wxList m_HandlersList; | |
172 | wxHashTable m_HandlersHash; | |
173 | ||
174 | DECLARE_NO_COPY_CLASS(wx28HtmlParser) | |
175 | ||
176 | // class for opening files (file system) | |
177 | wxFileSystem *m_FS; | |
178 | // handlers stack used by PushTagHandler and PopTagHandler | |
179 | wxList *m_HandlersStack; | |
180 | ||
181 | // entity parser | |
182 | wx28HtmlEntitiesParser *m_entitiesParser; | |
183 | ||
184 | // flag indicating that the parser should stop | |
185 | bool m_stopParsing; | |
186 | }; | |
187 | ||
188 | ||
189 | ||
190 | // This class (and derived classes) cooperates with wx28HtmlParser. | |
191 | // Each recognized tag is passed to handler which is capable | |
192 | // of handling it. Each tag is handled in 3 steps: | |
193 | // 1. Handler modifies state of parser | |
194 | // (using its public methods) | |
195 | // 2. Parser parses source between starting and ending tag | |
196 | // 3. Handler restores original state of the parser | |
197 | class wx28HtmlTagHandler : public wxObject | |
198 | { | |
199 | DECLARE_ABSTRACT_CLASS(wx28HtmlTagHandler) | |
200 | ||
201 | public: | |
202 | wx28HtmlTagHandler() : wxObject () { m_Parser = NULL; } | |
203 | ||
204 | // Sets the parser. | |
205 | // NOTE : each _instance_ of handler is guaranteed to be called | |
206 | // only by one parser. This means you don't have to care about | |
207 | // reentrancy. | |
208 | virtual void SetParser(wx28HtmlParser *parser) | |
209 | { m_Parser = parser; } | |
210 | ||
211 | // Returns list of supported tags. The list is in uppercase and | |
212 | // tags are delimited by ','. | |
213 | // Example : "I,B,FONT,P" | |
214 | // is capable of handling italic, bold, font and paragraph tags | |
215 | virtual wxString GetSupportedTags() = 0; | |
216 | ||
217 | // This is the core handling method. It does all the Steps 1-3. | |
218 | // To process step 2, you can call ParseInner() | |
219 | // returned value : true if it called ParseInner(), | |
220 | // false otherwise | |
221 | virtual bool HandleTag(const wx28HtmlTag& tag) = 0; | |
222 | ||
223 | protected: | |
224 | // parses input between beginning and ending tag. | |
225 | // m_Parser must be set. | |
226 | void ParseInner(const wx28HtmlTag& tag) | |
227 | { m_Parser->DoParsing(tag.GetBeginPos(), tag.GetEndPos1()); } | |
228 | ||
229 | // Parses given source as if it was tag's inner code (see | |
230 | // wx28HtmlParser::GetInnerSource). Unlike ParseInner(), this method lets | |
231 | // you specify the source code to parse. This is useful when you need to | |
232 | // modify the inner text before parsing. | |
233 | void ParseInnerSource(const wxString& source); | |
234 | ||
235 | wx28HtmlParser *m_Parser; | |
236 | ||
237 | DECLARE_NO_COPY_CLASS(wx28HtmlTagHandler) | |
238 | }; | |
239 | ||
240 | ||
241 | // This class is used to parse HTML entities in strings. It can handle | |
242 | // both named entities and &#xxxx entities where xxxx is Unicode code. | |
243 | class wx28HtmlEntitiesParser : public wxObject | |
244 | { | |
245 | DECLARE_DYNAMIC_CLASS(wx28HtmlEntitiesParser) | |
246 | ||
247 | public: | |
248 | wx28HtmlEntitiesParser(); | |
249 | virtual ~wx28HtmlEntitiesParser(); | |
250 | ||
251 | // Sets encoding of output string. | |
8d94819c | 252 | // Has no effect if wxUSE_UNICODE==1 |
c21faa0a VS |
253 | void SetEncoding(wxFontEncoding encoding); |
254 | ||
255 | // Parses entities in input and replaces them with respective characters | |
256 | // (with respect to output encoding) | |
257 | wxString Parse(const wxString& input); | |
258 | ||
259 | // Returns character for given entity or 0 if the entity is unknown | |
260 | wxChar GetEntityChar(const wxString& entity); | |
261 | ||
262 | // Returns character that represents given Unicode code | |
263 | #if wxUSE_UNICODE | |
264 | wxChar GetCharForCode(unsigned code) { return (wxChar)code; } | |
265 | #else | |
266 | wxChar GetCharForCode(unsigned code); | |
267 | #endif | |
268 | ||
269 | protected: | |
8d94819c | 270 | #if !wxUSE_UNICODE |
c21faa0a VS |
271 | wxMBConv *m_conv; |
272 | wxFontEncoding m_encoding; | |
273 | #endif | |
274 | ||
275 | DECLARE_NO_COPY_CLASS(wx28HtmlEntitiesParser) | |
276 | }; | |
277 | ||
278 | ||
279 | #endif // _WX_HTMLPARS_H_ |