]>
Commit | Line | Data |
---|---|---|
d7463f75 JS |
1 | ///////////////////////////////////////////////////////////////////////////// |
2 | // Name: htmlparser.h | |
3 | // Purpose: Simple HTML parser | |
4 | // Author: Julian Smart | |
5 | // Modified by: | |
6 | // Created: 2002-09-25 | |
7 | // RCS-ID: $Id$ | |
8 | // Copyright: (c) Julian Smart | |
9 | // Licence: wxWindows license | |
10 | ///////////////////////////////////////////////////////////////////////////// | |
11 | ||
12 | #ifndef _HTMLPARSER_H_ | |
13 | #define _HTMLPARSER_H_ | |
14 | ||
15 | #ifdef __GNUG__ | |
f8105809 | 16 | #pragma interface "htmlparser.cpp" |
d7463f75 JS |
17 | #endif |
18 | ||
19 | //#include "wx/module.h" | |
20 | #include "wx/stream.h" | |
21 | ||
22 | /* | |
23 | * wxSimpleHtmlAttribute | |
24 | * Representation of an attribute | |
25 | */ | |
26 | ||
27 | class wxSimpleHtmlAttribute | |
28 | { | |
29 | friend class wxSimpleHtmlTag; | |
30 | public: | |
31 | wxSimpleHtmlAttribute(const wxString& name, const wxString& value) | |
32 | { | |
33 | m_name = name; m_value = value; m_next = NULL; | |
34 | } | |
35 | //// Operations | |
36 | ||
37 | // Write this attribute | |
38 | void Write(wxOutputStream& stream); | |
39 | ||
40 | //// Accessors | |
41 | const wxString& GetName() const { return m_name; } | |
42 | const wxString& GetValue() const { return m_value; } | |
43 | void SetName(const wxString& name) { m_name = name; } | |
44 | void SetValue(const wxString& value) { m_value = value; } | |
45 | ||
46 | wxSimpleHtmlAttribute* GetNextAttribute() { return m_next; } | |
47 | void SetNextAttribute(wxSimpleHtmlAttribute* attr) { m_next = attr; } | |
48 | ||
49 | bool HasName(const wxString& name) const { return (0 == m_name.CmpNoCase(name)); } | |
50 | bool HasValue(const wxString& val) const { return (0 == m_value.CmpNoCase(val)); } | |
51 | ||
52 | private: | |
53 | wxString m_name; | |
54 | wxString m_value; | |
55 | wxSimpleHtmlAttribute* m_next; | |
56 | }; | |
57 | ||
58 | ||
59 | /* | |
60 | * wxSimpleHtmlTag | |
61 | * Representation of a tag or chunk of text | |
62 | */ | |
63 | ||
// Kinds of node a wxSimpleHtmlTag can represent (stored as its int type).
enum
{
    wxSimpleHtmlTag_Text,           // A chunk of text
    wxSimpleHtmlTag_TopLevel,       // The top-level tag
    wxSimpleHtmlTag_Open,           // An opening tag
    wxSimpleHtmlTag_Close,          // A closing tag
    wxSimpleHtmlTag_Directive,      // A directive, e.g. <!DOCTYPE ....>
    wxSimpleHtmlTag_XMLDeclaration  // An XML declaration, e.g. <?xml .... ?>
};
65 | ||
66 | class wxSimpleHtmlTag | |
67 | { | |
68 | public: | |
69 | wxSimpleHtmlTag(const wxString& tagName, int tagType); | |
70 | ~wxSimpleHtmlTag(); | |
71 | ||
72 | //// Operations | |
73 | void ClearAttributes(); | |
74 | wxSimpleHtmlAttribute* FindAttribute(const wxString& name) const ; | |
75 | void AppendAttribute(const wxString& name, const wxString& value); | |
76 | void ClearChildren(); | |
77 | // Remove 1 tag from the child list. | |
78 | void RemoveChild(wxSimpleHtmlTag *remove); | |
79 | // Appaned tag to the end of the child list. | |
80 | void AppendTag(wxSimpleHtmlTag* tag); | |
81 | // Insert tag after ourself in the parents child list. | |
82 | void AppendTagAfterUs(wxSimpleHtmlTag* tag); | |
83 | // Write this tag | |
84 | void Write(wxOutputStream& stream); | |
85 | ||
86 | // Gets the text from this tag and its descendants | |
87 | wxString GetTagText(); | |
88 | ||
89 | //// Accessors | |
90 | const wxString& GetName() const { return m_name; } | |
91 | void SetName(const wxString& name) { m_name = name; } | |
92 | ||
93 | int GetType() const { return m_type; } | |
94 | void SetType(int t) { m_type = t; } | |
95 | ||
96 | // If type is wxSimpleHtmlTag_Text, m_text will contain some text. | |
97 | const wxString& GetText() const { return m_text; } | |
98 | void SetText(const wxString& text) { m_text = text; } | |
99 | ||
100 | wxSimpleHtmlAttribute* GetFirstAttribute() { return m_attributes; } | |
101 | void SetFirstAttribute(wxSimpleHtmlAttribute* attr) { m_attributes = attr; } | |
102 | ||
103 | int GetAttributeCount() const ; | |
104 | wxSimpleHtmlAttribute* GetAttribute(int i) const ; | |
105 | ||
106 | wxSimpleHtmlTag* GetChildren() const { return m_children; } | |
107 | void SetChildren(wxSimpleHtmlTag* children) { m_children = children; } | |
108 | ||
109 | wxSimpleHtmlTag* GetParent() const { return m_parent; } | |
110 | void SetParent(wxSimpleHtmlTag* parent) { m_parent = parent; } | |
111 | int GetChildCount() const; | |
112 | wxSimpleHtmlTag* GetChild(int i) const; | |
113 | wxSimpleHtmlTag* GetNext() const { return m_next; } | |
114 | ||
115 | //// Convenience accessors & search functions | |
116 | bool NameIs(const wxString& name) { return (m_name.CmpNoCase(name) == 0); } | |
117 | bool HasAttribute(const wxString& name, const wxString& value) const; | |
118 | bool HasAttribute(const wxString& name) const; | |
119 | bool GetAttributeValue(wxString& value, const wxString& attrName); | |
120 | ||
121 | // Search forward from this tag until we find a tag with this name & optionally attribute | |
122 | wxSimpleHtmlTag* FindTag(const wxString& tagName, const wxString& attrName = wxEmptyString); | |
123 | ||
124 | // Gather the text until we hit the given close tag | |
125 | bool FindTextUntilTagClose(wxString& text, const wxString& tagName); | |
126 | ||
127 | private: | |
128 | wxString m_name; | |
129 | int m_type; | |
130 | wxString m_text; | |
131 | wxSimpleHtmlAttribute* m_attributes; | |
132 | ||
133 | // List of children | |
134 | wxSimpleHtmlTag* m_children; | |
135 | wxSimpleHtmlTag* m_next; // Next sibling | |
136 | wxSimpleHtmlTag* m_parent; | |
137 | }; | |
138 | ||
139 | /* | |
140 | * wxSimpleHtmlParser | |
141 | * Simple HTML parser, for such tasks as scanning HTML for keywords, contents, etc. | |
142 | */ | |
143 | ||
144 | class wxSimpleHtmlParser : public wxObject | |
145 | { | |
146 | ||
147 | public: | |
148 | wxSimpleHtmlParser(); | |
149 | ~wxSimpleHtmlParser(); | |
150 | ||
151 | //// Operations | |
152 | bool ParseFile(const wxString& filename); | |
153 | bool ParseString(const wxString& str); | |
154 | void Clear(); | |
155 | // Write this file | |
156 | void Write(wxOutputStream& stream); | |
157 | bool WriteFile(wxString& filename); | |
158 | ||
159 | //// Helpers | |
160 | ||
161 | // Main recursive parsing function | |
162 | bool ParseHtml(wxSimpleHtmlTag* parent); | |
163 | ||
164 | wxSimpleHtmlTag* ParseTagHeader(); | |
165 | wxSimpleHtmlTag* ParseTagClose(); | |
166 | bool ParseAttributes(wxSimpleHtmlTag* tag); | |
167 | wxSimpleHtmlTag* ParseDirective(); // e.g. <!DOCTYPE ....> | |
168 | wxSimpleHtmlTag* ParseXMLDeclaration(); // e.g. <?xml .... ?> | |
169 | bool ParseComment(); // Throw away comments | |
170 | // Plain text, up until an angled bracket | |
171 | bool ParseText(wxString& text); | |
172 | ||
173 | bool EatWhitespace(); // Throw away whitespace | |
174 | bool EatWhitespace(int& pos); // Throw away whitespace: using 'pos' | |
175 | bool ReadString(wxString& str, bool eatIt = FALSE); | |
176 | bool ReadWord(wxString& str, bool eatIt = FALSE); | |
177 | bool ReadNumber(wxString& str, bool eatIt = FALSE); | |
178 | // Could be number, string, whatever, but read up until whitespace. | |
179 | bool ReadLiteral(wxString& str, bool eatIt = FALSE); | |
180 | ||
181 | bool IsComment(); | |
182 | bool IsDirective(); | |
183 | bool IsXMLDeclaration(); | |
184 | bool IsString(); | |
185 | bool IsWord(); | |
186 | bool IsTagClose(); | |
187 | bool IsTagStartBracket(int ch); | |
188 | bool IsTagEndBracket(int ch); | |
189 | bool IsWhitespace(int ch); | |
190 | bool IsAlpha(int ch); | |
191 | bool IsWordChar(int ch); | |
192 | bool IsNumeric(int ch); | |
193 | // Check if a specific tag needs a close tag. If not this function should return FALSE. | |
194 | // If no close tag is needed the result will be that the tag will be insert in a none | |
195 | // hierarchical way. i.e. if the function would return FALSE all the time we would get | |
196 | // a flat list of all tags (like it used to be previously). | |
197 | virtual bool IsCloseTagNeeded(const wxString &name); | |
198 | ||
199 | // Encode/Decode Special Characters like: | |
200 | // > Begins a tag. > | |
201 | // < Ends a tag. < | |
202 | // " Quotation mark. " | |
203 | // ' Apostrophe. ' | |
204 | // & Ampersand. & | |
205 | static void DecodeSpecialChars(wxString &value); | |
206 | static wxString EncodeSpecialChars(const wxString &value); | |
207 | ||
208 | // Matches this string (case insensitive) | |
209 | bool Matches(const wxString& tok, bool eatIt = FALSE) ; | |
210 | bool Eof() const { return (m_pos >= m_length); } | |
211 | bool Eof(int pos) const { return (pos >= m_length); } | |
212 | ||
213 | void SetPosition(int pos) { m_pos = pos; } | |
214 | ||
215 | ||
216 | //// Accessors | |
217 | wxSimpleHtmlTag* GetTopLevelTag() const { return m_topLevel; } | |
218 | ||
219 | // Safe way of getting a character | |
220 | int GetChar(size_t i) const; | |
221 | ||
222 | private: | |
223 | ||
224 | wxSimpleHtmlTag* m_topLevel; | |
225 | int m_pos; // Position in string | |
226 | int m_length; // Length of string | |
227 | wxString m_text; // The actual text | |
228 | ||
229 | }; | |
230 | ||
231 | /* | |
232 | * wxSimpleHtmlTagSpec | |
233 | * Describes a tag, and what type it is. | |
234 | * wxSimpleHtmlModule will initialise/cleanup a list of these, one per tag type | |
235 | */ | |
236 | ||
237 | #if 0 | |
238 | class wxSimpleHtmlTagSpec : public wxObject | |
239 | { | |
240 | ||
241 | public: | |
242 | wxSimpleHtmlTagSpec(const wxString& name, int type); | |
243 | ||
244 | //// Operations | |
245 | static void AddTagSpec(wxSimpleHtmlTagSpec* spec); | |
246 | static void Clear(); | |
247 | ||
248 | //// Accessors | |
249 | const wxString& GetName() const { return m_name; } | |
250 | int GetType() const { return m_type; } | |
251 | ||
252 | private: | |
253 | ||
254 | wxString m_name; | |
255 | int m_type; | |
256 | ||
257 | static wxList* sm_tagSpecs; | |
258 | }; | |
259 | ||
260 | /* | |
261 | * wxSimpleHtmlModule | |
262 | * Responsible for init/cleanup of appropriate data structures | |
263 | */ | |
264 | ||
265 | class wxSimpleHtmlModule : public wxModule | |
266 | { | |
267 | DECLARE_DYNAMIC_CLASS(wxSimpleHtmlModule) | |
268 | ||
269 | public: | |
270 | wxSimpleHtmlModule() {}; | |
271 | ||
272 | bool OnInit() ; | |
273 | void OnExit() ; | |
274 | }; | |
275 | #endif | |
276 | ||
277 | #endif |