]>
Commit | Line | Data |
---|---|---|
d7463f75 JS |
1 | ///////////////////////////////////////////////////////////////////////////// |
2 | // Name: htmlparser.h | |
3 | // Purpose: Simple HTML parser | |
4 | // Author: Julian Smart | |
5 | // Modified by: | |
6 | // Created: 2002-09-25 | |
7 | // RCS-ID: $Id$ | |
8 | // Copyright: (c) Julian Smart | |
9 | // Licence: wxWindows license | |
10 | ///////////////////////////////////////////////////////////////////////////// | |
11 | ||
12 | #ifndef _HTMLPARSER_H_ | |
13 | #define _HTMLPARSER_H_ | |
14 | ||
d7463f75 JS |
15 | //#include "wx/module.h" |
16 | #include "wx/stream.h" | |
17 | ||
18 | /* | |
19 | * wxSimpleHtmlAttribute | |
20 | * Representation of an attribute | |
21 | */ | |
22 | ||
23 | class wxSimpleHtmlAttribute | |
24 | { | |
25 | friend class wxSimpleHtmlTag; | |
26 | public: | |
27 | wxSimpleHtmlAttribute(const wxString& name, const wxString& value) | |
28 | { | |
29 | m_name = name; m_value = value; m_next = NULL; | |
30 | } | |
31 | //// Operations | |
32 | ||
33 | // Write this attribute | |
34 | void Write(wxOutputStream& stream); | |
35 | ||
36 | //// Accessors | |
37 | const wxString& GetName() const { return m_name; } | |
38 | const wxString& GetValue() const { return m_value; } | |
39 | void SetName(const wxString& name) { m_name = name; } | |
40 | void SetValue(const wxString& value) { m_value = value; } | |
41 | ||
42 | wxSimpleHtmlAttribute* GetNextAttribute() { return m_next; } | |
43 | void SetNextAttribute(wxSimpleHtmlAttribute* attr) { m_next = attr; } | |
44 | ||
45 | bool HasName(const wxString& name) const { return (0 == m_name.CmpNoCase(name)); } | |
46 | bool HasValue(const wxString& val) const { return (0 == m_value.CmpNoCase(val)); } | |
47 | ||
48 | private: | |
49 | wxString m_name; | |
50 | wxString m_value; | |
51 | wxSimpleHtmlAttribute* m_next; | |
52 | }; | |
53 | ||
54 | ||
55 | /* | |
56 | * wxSimpleHtmlTag | |
57 | * Representation of a tag or chunk of text | |
58 | */ | |
59 | ||
// Tag kinds, stored as a plain int in wxSimpleHtmlTag (see GetType/SetType).
// The enumerators keep their implicit values 0..5.
enum
{
    wxSimpleHtmlTag_Text,            // a chunk of text
    wxSimpleHtmlTag_TopLevel,
    wxSimpleHtmlTag_Open,
    wxSimpleHtmlTag_Close,
    wxSimpleHtmlTag_Directive,       // e.g. <!DOCTYPE ....>
    wxSimpleHtmlTag_XMLDeclaration   // e.g. <?xml .... ?>
};
61 | ||
62 | class wxSimpleHtmlTag | |
63 | { | |
64 | public: | |
65 | wxSimpleHtmlTag(const wxString& tagName, int tagType); | |
66 | ~wxSimpleHtmlTag(); | |
67 | ||
68 | //// Operations | |
69 | void ClearAttributes(); | |
70 | wxSimpleHtmlAttribute* FindAttribute(const wxString& name) const ; | |
71 | void AppendAttribute(const wxString& name, const wxString& value); | |
72 | void ClearChildren(); | |
73 | // Remove 1 tag from the child list. | |
74 | void RemoveChild(wxSimpleHtmlTag *remove); | |
75 | // Appaned tag to the end of the child list. | |
76 | void AppendTag(wxSimpleHtmlTag* tag); | |
77 | // Insert tag after ourself in the parents child list. | |
78 | void AppendTagAfterUs(wxSimpleHtmlTag* tag); | |
79 | // Write this tag | |
80 | void Write(wxOutputStream& stream); | |
81 | ||
82 | // Gets the text from this tag and its descendants | |
83 | wxString GetTagText(); | |
84 | ||
85 | //// Accessors | |
86 | const wxString& GetName() const { return m_name; } | |
87 | void SetName(const wxString& name) { m_name = name; } | |
88 | ||
89 | int GetType() const { return m_type; } | |
90 | void SetType(int t) { m_type = t; } | |
91 | ||
92 | // If type is wxSimpleHtmlTag_Text, m_text will contain some text. | |
93 | const wxString& GetText() const { return m_text; } | |
94 | void SetText(const wxString& text) { m_text = text; } | |
95 | ||
96 | wxSimpleHtmlAttribute* GetFirstAttribute() { return m_attributes; } | |
97 | void SetFirstAttribute(wxSimpleHtmlAttribute* attr) { m_attributes = attr; } | |
98 | ||
99 | int GetAttributeCount() const ; | |
100 | wxSimpleHtmlAttribute* GetAttribute(int i) const ; | |
101 | ||
102 | wxSimpleHtmlTag* GetChildren() const { return m_children; } | |
103 | void SetChildren(wxSimpleHtmlTag* children) { m_children = children; } | |
104 | ||
105 | wxSimpleHtmlTag* GetParent() const { return m_parent; } | |
106 | void SetParent(wxSimpleHtmlTag* parent) { m_parent = parent; } | |
107 | int GetChildCount() const; | |
108 | wxSimpleHtmlTag* GetChild(int i) const; | |
109 | wxSimpleHtmlTag* GetNext() const { return m_next; } | |
110 | ||
111 | //// Convenience accessors & search functions | |
112 | bool NameIs(const wxString& name) { return (m_name.CmpNoCase(name) == 0); } | |
113 | bool HasAttribute(const wxString& name, const wxString& value) const; | |
114 | bool HasAttribute(const wxString& name) const; | |
115 | bool GetAttributeValue(wxString& value, const wxString& attrName); | |
116 | ||
117 | // Search forward from this tag until we find a tag with this name & optionally attribute | |
118 | wxSimpleHtmlTag* FindTag(const wxString& tagName, const wxString& attrName = wxEmptyString); | |
119 | ||
120 | // Gather the text until we hit the given close tag | |
121 | bool FindTextUntilTagClose(wxString& text, const wxString& tagName); | |
122 | ||
123 | private: | |
124 | wxString m_name; | |
125 | int m_type; | |
126 | wxString m_text; | |
127 | wxSimpleHtmlAttribute* m_attributes; | |
128 | ||
129 | // List of children | |
130 | wxSimpleHtmlTag* m_children; | |
131 | wxSimpleHtmlTag* m_next; // Next sibling | |
132 | wxSimpleHtmlTag* m_parent; | |
133 | }; | |
134 | ||
135 | /* | |
136 | * wxSimpleHtmlParser | |
137 | * Simple HTML parser, for such tasks as scanning HTML for keywords, contents, etc. | |
138 | */ | |
139 | ||
140 | class wxSimpleHtmlParser : public wxObject | |
141 | { | |
142 | ||
143 | public: | |
144 | wxSimpleHtmlParser(); | |
145 | ~wxSimpleHtmlParser(); | |
146 | ||
147 | //// Operations | |
148 | bool ParseFile(const wxString& filename); | |
149 | bool ParseString(const wxString& str); | |
150 | void Clear(); | |
151 | // Write this file | |
152 | void Write(wxOutputStream& stream); | |
153 | bool WriteFile(wxString& filename); | |
154 | ||
155 | //// Helpers | |
156 | ||
157 | // Main recursive parsing function | |
158 | bool ParseHtml(wxSimpleHtmlTag* parent); | |
159 | ||
160 | wxSimpleHtmlTag* ParseTagHeader(); | |
161 | wxSimpleHtmlTag* ParseTagClose(); | |
162 | bool ParseAttributes(wxSimpleHtmlTag* tag); | |
163 | wxSimpleHtmlTag* ParseDirective(); // e.g. <!DOCTYPE ....> | |
164 | wxSimpleHtmlTag* ParseXMLDeclaration(); // e.g. <?xml .... ?> | |
165 | bool ParseComment(); // Throw away comments | |
166 | // Plain text, up until an angled bracket | |
167 | bool ParseText(wxString& text); | |
168 | ||
169 | bool EatWhitespace(); // Throw away whitespace | |
170 | bool EatWhitespace(int& pos); // Throw away whitespace: using 'pos' | |
4fe30bce WS |
171 | bool ReadString(wxString& str, bool eatIt = false); |
172 | bool ReadWord(wxString& str, bool eatIt = false); | |
173 | bool ReadNumber(wxString& str, bool eatIt = false); | |
d7463f75 | 174 | // Could be number, string, whatever, but read up until whitespace. |
4fe30bce | 175 | bool ReadLiteral(wxString& str, bool eatIt = false); |
d7463f75 JS |
176 | |
177 | bool IsComment(); | |
178 | bool IsDirective(); | |
179 | bool IsXMLDeclaration(); | |
180 | bool IsString(); | |
181 | bool IsWord(); | |
182 | bool IsTagClose(); | |
183 | bool IsTagStartBracket(int ch); | |
184 | bool IsTagEndBracket(int ch); | |
185 | bool IsWhitespace(int ch); | |
186 | bool IsAlpha(int ch); | |
187 | bool IsWordChar(int ch); | |
188 | bool IsNumeric(int ch); | |
4fe30bce | 189 | // Check if a specific tag needs a close tag. If not this function should return false. |
d7463f75 | 190 | // If no close tag is needed the result will be that the tag will be insert in a none |
4fe30bce | 191 | // hierarchical way. i.e. if the function would return false all the time we would get |
d7463f75 JS |
192 | // a flat list of all tags (like it used to be previously). |
193 | virtual bool IsCloseTagNeeded(const wxString &name); | |
194 | ||
195 | // Encode/Decode Special Characters like: | |
196 | // > Begins a tag. > | |
197 | // < Ends a tag. < | |
198 | // " Quotation mark. " | |
199 | // ' Apostrophe. ' | |
200 | // & Ampersand. & | |
201 | static void DecodeSpecialChars(wxString &value); | |
202 | static wxString EncodeSpecialChars(const wxString &value); | |
203 | ||
204 | // Matches this string (case insensitive) | |
4fe30bce | 205 | bool Matches(const wxString& tok, bool eatIt = false) ; |
d7463f75 JS |
206 | bool Eof() const { return (m_pos >= m_length); } |
207 | bool Eof(int pos) const { return (pos >= m_length); } | |
208 | ||
209 | void SetPosition(int pos) { m_pos = pos; } | |
210 | ||
211 | ||
212 | //// Accessors | |
213 | wxSimpleHtmlTag* GetTopLevelTag() const { return m_topLevel; } | |
214 | ||
215 | // Safe way of getting a character | |
216 | int GetChar(size_t i) const; | |
217 | ||
218 | private: | |
219 | ||
220 | wxSimpleHtmlTag* m_topLevel; | |
221 | int m_pos; // Position in string | |
222 | int m_length; // Length of string | |
223 | wxString m_text; // The actual text | |
224 | ||
225 | }; | |
226 | ||
227 | /* | |
228 | * wxSimpleHtmlTagSpec | |
229 | * Describes a tag, and what type it is. | |
230 | * wxSimpleHtmlModule will initialise/cleanup a list of these, one per tag type | |
231 | */ | |
232 | ||
233 | #if 0 | |
234 | class wxSimpleHtmlTagSpec : public wxObject | |
235 | { | |
236 | ||
237 | public: | |
238 | wxSimpleHtmlTagSpec(const wxString& name, int type); | |
239 | ||
240 | //// Operations | |
241 | static void AddTagSpec(wxSimpleHtmlTagSpec* spec); | |
242 | static void Clear(); | |
243 | ||
244 | //// Accessors | |
245 | const wxString& GetName() const { return m_name; } | |
246 | int GetType() const { return m_type; } | |
247 | ||
248 | private: | |
249 | ||
250 | wxString m_name; | |
251 | int m_type; | |
252 | ||
253 | static wxList* sm_tagSpecs; | |
254 | }; | |
255 | ||
256 | /* | |
257 | * wxSimpleHtmlModule | |
258 | * Responsible for init/cleanup of appropriate data structures | |
259 | */ | |
260 | ||
261 | class wxSimpleHtmlModule : public wxModule | |
262 | { | |
263 | DECLARE_DYNAMIC_CLASS(wxSimpleHtmlModule) | |
264 | ||
265 | public: | |
266 | wxSimpleHtmlModule() {}; | |
267 | ||
268 | bool OnInit() ; | |
269 | void OnExit() ; | |
270 | }; | |
271 | #endif | |
272 | ||
273 | #endif |